gaoqiong / composable_kernel · Commits

Commit bc5b84b1, authored Dec 04, 2023 by Artur Wojcik

    Merge branch 'develop' into uif2-initial

Parents: acef6cc7, bc4bf9bd

Changes: 66. Showing 20 changed files with 899 additions and 73 deletions (+899 -73).
example/64_tensor_transforms/tensor_transform_wrapper.hpp  +425 -0
include/ck/ck.hpp  +3 -0
include/ck/stream_config.hpp  +2 -2
include/ck/tensor_operation/gpu/device/device_base.hpp  +3 -1
include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp  +3 -1
include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp  +3 -1
include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp  +3 -1
include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp  +7 -0
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp  +3 -1
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp  +3 -1
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp  +3 -1
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp  +5 -2
include/ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp  +3 -1
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp  +30 -11
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v4_direct_load.hpp  +142 -5
include/ck/utility/amd_buffer_addressing.hpp  +10 -0
include/ck/utility/tuple_helper.hpp  +88 -0
library/src/tensor_operation_instance/gpu/CMakeLists.txt  +146 -43
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_lds_direct_load_f16_f16_f16_mk_nk_mn_instance.cpp  +15 -1
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_kn_mn_instance.cpp  +2 -1
example/64_tensor_transforms/tensor_transform_wrapper.hpp (new file, 0 → 100644, +425 -0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence_helper.hpp"
#include "ck/utility/is_detected.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"

namespace ck {
namespace tensor_transform_wrapper {

/**
 * \brief Layout wrapper
 *
 * \details
 * Layout wrapper that performs the tensor descriptor logic.
 *
 * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
 *         (dynamic layout). It is possible to pass nested shapes
 *         (e.g. ((4, 2), 2)); nested dimensions are merged.
 * \tparam Strides Tuple of Number<> (for compile-time layout) or index_t
 *         (dynamic layout). The stride tuple should be nested if the shape
 *         tuple is nested.
 */
template <typename Shape, typename Strides = Tuple<>>
struct Layout
{
    private:
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    template <typename T>
    using is_tuple = decltype(std::declval<T&>().IsTuple());

    // Generate packed (column-major) strides if not passed
    template <typename... Ts>
    __host__ __device__ constexpr static auto
    GenerateColumnMajorPackedStrides(const Tuple<Ts...>& tuple)
    {
        return generate_tuple(
            [&](auto i) {
                if constexpr(i.value == 0)
                {
                    return I1;
                }
                else
                {
                    return TupleReduce<I0.value, i.value>([](auto x, auto y) { return x * y; },
                                                          tuple);
                }
            },
            Number<Tuple<Ts...>::Size()>{});
    }

    // Generate LowerDims at compile time for MergeTransform using the passed type.
    // If an element of Tuple<Ts...> is itself a tuple, then merge (generate a sequence for the merge).
    // If the element is a scalar, then pass through (sequence with one element).
    template <typename Idx, typename... Ts>
    __host__ __device__ constexpr static auto GenerateLowerDim(const Tuple<Ts...>&)
    {
        if constexpr(Idx::value == 0)
        {
            if constexpr(is_detected<is_tuple, tuple_element_t<Idx::value, Tuple<Ts...>>>::value)
            {
                // Return Sequence for the first tuple
                constexpr index_t merge_nelems =
                    decltype(UnrollNestedTuple(tuple_element_t<Idx::value, Tuple<Ts...>>{}))::Size();
                using LowerDimsSequence =
                    typename arithmetic_sequence_gen<0, merge_nelems, 1>::type;
                return LowerDimsSequence::Reverse();
            }
            else
            {
                // Return first element
                return Sequence<0>{};
            }
        }
        else
        {
            // Get previous element using recurrence (at compile time)
            using PreviousSeqT =
                decltype(GenerateLowerDim<Number<Idx::value - 1>>(Tuple<Ts...>{}));
            const auto next_seq_val = PreviousSeqT::At(I0) + 1;
            if constexpr(is_detected<is_tuple, tuple_element_t<Idx::value, Tuple<Ts...>>>::value)
            {
                constexpr index_t merge_nelems =
                    decltype(UnrollNestedTuple(tuple_element_t<Idx::value, Tuple<Ts...>>{}))::Size();
                using LowerDimsSequence =
                    typename arithmetic_sequence_gen<next_seq_val, next_seq_val + merge_nelems, 1>::type;
                return LowerDimsSequence::Reverse();
            }
            else
            {
                return Sequence<next_seq_val>{};
            }
        }
    }

    // Iterate over nested tuples in shape.
    // Unroll nested tuples to align Tuple<ShapeDims...> to Tuple<IdxDims...>.
    // Example idx:    (1, 1), 1, 1
    // Example shape:  (2, (2, 2)), 2, (2, 2)
    // Unrolled shape: 2, (2, 2), 2, (2, 2)
    template <typename... ShapeDims, typename... IdxDims>
    __host__ __device__ constexpr static auto
    UnrollShapeViaIdx(const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>& idx)
    {
        if constexpr(!IsNestedTuple(Tuple<IdxDims...>{}))
        {
            // Index unrolled to flat form, return shape
            return shape;
        }
        else
        {
            // Iterate over shape tuple elements:
            // 1. If the corresponding idx element is a tuple, return it (will be unrolled).
            // 2. If not, pack it in a tuple. It will be restored during unrolling.
            auto unrolled_shape_via_idx = generate_tuple(
                [&](auto i) {
                    if constexpr(is_detected<is_tuple, tuple_element_t<i, Tuple<IdxDims...>>>::value)
                    {
                        return shape.At(i);
                    }
                    else
                    {
                        return make_tuple(shape.At(i));
                    }
                },
                Number<Tuple<IdxDims...>::Size()>{});
            // Unroll and process the next step
            return UnrollShapeViaIdx(UnrollNestedTuple<0, 1>(unrolled_shape_via_idx),
                                     UnrollNestedTuple<0, 1>(idx));
        }
    }

    template <typename... ShapeDims, typename DescriptorToMerge>
    __host__ __device__ constexpr static auto
    MakeMerge1d(const Tuple<ShapeDims...>& shape, DescriptorToMerge& desc)
    {
        // Reverse each element in tuple
        using ReversedUnrolledShape = decltype(TupleReverse(UnrollNestedTuple(shape)));
        const auto merge_elems      = ReversedUnrolledShape{};
        // Generate reversed indexes (column-major traverse)
        using MergeElemsSequence =
            typename arithmetic_sequence_gen<0, ReversedUnrolledShape::Size(), 1>::type;
        const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
        const auto upper_dims = make_tuple(Sequence<0>{});
        // Merge to 1d
        return transform_tensor_descriptor(
            desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
    }

    // Merge nested shape dims.
    // Input desc shape: 2, 2, 2, 2, 2, 2
    // Example idx:      1, 1, 1, 1
    // Example shape:    2, (2, 2), 2, (2, 2)
    // Merged shape:     2, 4, 2, 4
    template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge>
    __host__ __device__ constexpr static auto MakeMerges(const Tuple<ShapeDims...>& shape,
                                                         const Tuple<IdxDims...>&,
                                                         DescriptorToMerge& desc)
    {
        const auto transforms = generate_tuple(
            [&](auto i) {
                // Compare Idx with shape
                if constexpr(is_detected<is_tuple, tuple_element_t<i, Tuple<ShapeDims...>>>::value &&
                             !is_detected<is_tuple, tuple_element_t<i, Tuple<IdxDims...>>>::value)
                {
                    // If the shape element is a tuple and the idx element is a Number, then merge.
                    // Unroll and reverse the tuple to traverse column-major.
                    const auto merge_elems = TupleReverse(UnrollNestedTuple(shape.At(i)));
                    return make_merge_transform(merge_elems);
                }
                else
                {
                    // If the shape element is an integer and the idx element is a tuple, the passed idx is wrong
                    static_assert(
                        !(!is_detected<is_tuple, tuple_element_t<i, Tuple<ShapeDims...>>>::value &&
                          is_detected<is_tuple, tuple_element_t<i, Tuple<IdxDims...>>>::value),
                        "Wrong Idx for layout()");
                    // If the shape element has the same type as the idx element, then pass through
                    return make_pass_through_transform(shape.At(i));
                }
            },
            Number<Tuple<ShapeDims...>::Size()>{});
        const auto lower_dims =
            generate_tuple([&](auto i) { return GenerateLowerDim<Number<i>>(shape); },
                           Number<Tuple<ShapeDims...>::Size()>{});
        const auto upper_dims = generate_tuple([&](auto i) { return Sequence<i.value>{}; },
                                               Number<Tuple<ShapeDims...>::Size()>{});
        return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims);
    }

    template <typename... ShapeDims, typename... IdxDims>
    __host__ __device__ constexpr auto TransformDesc(const Tuple<ShapeDims...>& shape,
                                                     const Tuple<IdxDims...>& idx) const
    {
        if constexpr(Tuple<IdxDims...>::Size() == I1)
        {
            // 1d idx path
            return MakeMerge1d(shape, descriptor_);
        }
        else
        {
            // Merge nested shape dims.
            // Example idx:   (1, 1), 1, 1
            // Example shape: (2, (2, 2)), 2, (2, 2)
            // Merged shape:  (2, 4), 2, 4
            static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(),
                          "Idx rank and Shape rank must be the same (except 1d).");
            // Unroll while IdxDims is nested
            const auto unrolled_shape_via_idx = UnrollShapeViaIdx(shape, idx);
            // Transform the correct form of shape
            return MakeMerges(unrolled_shape_via_idx, UnrollNestedTuple(idx), descriptor_);
        }
    }

    template <typename LayoutShape, typename LayoutStrides>
    __host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape,
                                                        const LayoutStrides& strides)
    {
        const auto unrolled_shape = UnrollNestedTuple(shape);
        if constexpr(ck::is_same_v<LayoutStrides, Tuple<>>)
        {
            // If shape is packed
            const auto column_major_packed_strides =
                GenerateColumnMajorPackedStrides(unrolled_shape);
            return make_naive_tensor_descriptor(unrolled_shape, column_major_packed_strides);
        }
        else
        {
            const auto unrolled_strides = UnrollNestedTuple(strides);
            static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
                          "Size of strides and shape are not consistent.");
            return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
        }
    }

    public:
    using NaiveDescriptorType = remove_cvref_t<decltype(MakeNaiveDescriptor(Shape{}, Strides{}))>;

    /**
     * \brief Layout constructor.
     *
     * \param shape Shape for layout.
     * \param strides Strides for layout (optional if tensor is packed).
     * \return Layout object.
     */
    __host__ __device__ Layout() = delete;
    __host__ __device__ Layout(const Shape& shape, const Strides& strides) : descriptor_{}
    {
        // Construct if runtime mode
        if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
        {
            // Keep only the shape; strides are not needed for transforms
            shape_      = shape;
            descriptor_ = MakeNaiveDescriptor(shape, strides);
        }
    }

    __host__ __device__ Layout(const Shape& shape) : descriptor_{}
    {
        if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
        {
            shape_      = shape;
            descriptor_ = MakeNaiveDescriptor(shape, Strides{});
        }
    }

    /**
     * \brief Returns the real offset to an element at compile time.
     *
     * \tparam Idxs Tuple of indexes.
     * \return Calculated offset.
     */
    template <typename Idxs>
    __host__ __device__ constexpr index_t operator()() const
    {
        using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}));
        using UnrolledIdx     = decltype(UnrollNestedTuple(Idxs{}));
        return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
    }

    /**
     * \brief Returns the real offset to an element at runtime.
     *
     * \param Idx Tuple of indexes.
     * \return Calculated offset.
     */
    template <typename... Ts>
    __host__ __device__ index_t operator()(const Tuple<Ts...>& Idx) const
    {
        // Static so that transformed_desc is constructed only once
        static const auto transformed_desc = TransformDesc(shape_, Idx);
        return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
    }

    /**
     * \brief Length getter (product if tuple).
     *
     * \tparam IDim Tuple of indexes or index.
     * \return Calculated size.
     */
    template <index_t IDim>
    __host__ __device__ constexpr index_t GetLength() const
    {
        const auto elem = shape_.At(Number<IDim>{});
        if constexpr(is_detected<is_tuple, tuple_element_t<IDim, Shape>>::value)
        {
            const auto unrolled_element = UnrollNestedTuple(elem);
            return TupleReduce<I0.value, unrolled_element.Size()>(
                [](auto x, auto y) { return x * y; }, unrolled_element);
        }
        else
        {
            return elem;
        }
    }

    /**
     * \brief Layout size getter (product of shape).
     *
     * \return Calculated size.
     */
    __host__ __device__ constexpr index_t GetLength() const
    {
        const auto unrolled_shape = UnrollNestedTuple(shape_);
        return TupleReduce<I0.value, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
                                                            unrolled_shape);
    }

    /**
     * \brief Dimension getter.
     *
     * \tparam IDim Dimension idx.
     * \return Calculated size.
     */
    template <index_t IDim>
    __host__ __device__ constexpr auto Get() const
    {
        const auto elem = shape_.At(Number<IDim>{});
        return elem;
    }

    private:
    NaiveDescriptorType descriptor_;
    Shape shape_;
};

// Layout helpers
// Length getter (product if tuple)
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
    return layout.template GetLength<idx>();
}

// Get shape size (product of dims if tuple)
template <typename... ShapeDims>
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
{
    using UnrolledShape = decltype(UnrollNestedTuple(shape));
    return TupleReduce<0, UnrolledShape::Size()>([](auto x, auto y) { return x * y; },
                                                 UnrolledShape{});
}

// Get dim size (could be returned from the get function)
template <typename T>
__host__ __device__ constexpr T size(const T& dim)
{
    return dim;
}

// Get layout size (product of shape dims)
template <typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
    return layout.GetLength();
}

// Get shape element size
template <index_t idx, typename... ShapeDims>
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
{
    return size(shape.At(Number<idx>{}));
}

// Dim getter (tuple if tuple)
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
{
    return layout.template Get<idx>();
}

template <typename Shape, typename Strides>
__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
                                                                 const Strides& strides)
{
    return Layout<Shape, Strides>(shape, strides);
}

template <typename Shape>
__host__ __device__ constexpr Layout<Shape> make_layout(const Shape& shape)
{
    return Layout<Shape>(shape);
}

} // namespace tensor_transform_wrapper
} // namespace ck
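The sketch below is not part of the commit; it is a minimal host-side usage example of the Layout wrapper above, showing how a nested shape such as ((4, 2), 2) is merged when indexed with fewer coordinates. The example program, its include path, and the concrete shape values are assumptions; the calls themselves (make_layout, operator(), size) are the ones defined in tensor_transform_wrapper.hpp and would be compiled with hipcc together with the CK headers.

// Minimal sketch, assuming the header is reachable as shown; compile with hipcc.
#include "ck/ck.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/number.hpp"
#include "tensor_transform_wrapper.hpp" // hypothetical local include path

using namespace ck;
using namespace ck::tensor_transform_wrapper;

int main()
{
    // Nested shape ((4, 2), 2): the inner (4, 2) pair is merged into one
    // dimension of length 8 when it is addressed with a single integer.
    const auto shape  = make_tuple(make_tuple(4, 2), 2);
    const auto layout = make_layout(shape); // packed, column-major strides

    // Rank-2 index: the first coordinate addresses the merged (4, 2) dimension.
    const index_t offset = layout(make_tuple(3, 1));

    // size<0> is the product of the nested dims (4 * 2 = 8); size is the total (16).
    const index_t len0  = size<0>(layout);
    const index_t total = size(layout);

    return (offset >= 0 && len0 == 8 && total == 16) ? 0 : 1;
}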
include/ck/ck.hpp (+3 -0)

@@ -134,6 +134,9 @@
 // inner product using V_DOT with DPP8 modifiers
 #define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1

+// LDS direct loads using inline assembly
+#define CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 1
+
 // set stochastic rounding as default for f8 conversions
 #define CK_USE_SR_F8_CONVERSION 1
include/ck/stream_config.hpp (+2 -2)

@@ -11,6 +11,6 @@ struct StreamConfig
 hipStream_t stream_id_ = nullptr;
 bool time_kernel_      = false;
 int log_level_         = 0;
-int cold_niters_       = 50;
-int nrepeat_           = 200;
+int cold_niters_       = 1;
+int nrepeat_           = 10;
 };
include/ck/tensor_operation/gpu/device/device_base.hpp (+3 -1)

@@ -59,7 +59,9 @@ struct BaseOperator
     virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }

-    virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const
+    virtual void SetWorkSpacePointer(BaseArgument* p_arg,
+                                     void* p_workspace,
+                                     const StreamConfig& = StreamConfig{}) const
     {
         assert(p_arg);
         p_arg->p_workspace_ = p_workspace;
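A hedged usage sketch, not taken from the commit, of how a host application might drive the new SetWorkSpacePointer overload so that any workspace initialization an operator performs (for example an asynchronous memset, as in DeviceGroupedGemm_Xdl_Fixed_NK below) lands on the caller's HIP stream. The function name setup_workspace and the variable names are assumptions; GetWorkSpaceSize/SetWorkSpacePointer and StreamConfig come from this diff, and hipMalloc is the standard HIP allocation call. Error handling is omitted for brevity.

// Sketch only: assumes CK is installed and the code is compiled with hipcc.
#include <cstddef>
#include <hip/hip_runtime.h>
#include "ck/stream_config.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

void setup_workspace(ck::tensor_operation::device::BaseOperator& op,
                     ck::tensor_operation::device::BaseArgument* arg_ptr,
                     hipStream_t stream)
{
    const std::size_t workspace_bytes = op.GetWorkSpaceSize(arg_ptr);

    void* p_workspace = nullptr;
    if(workspace_bytes != 0)
    {
        (void)hipMalloc(&p_workspace, workspace_bytes); // error handling omitted
    }

    // The StreamConfig now travels with the workspace pointer, so overrides
    // that initialize the workspace can do it asynchronously on `stream`
    // instead of implicitly synchronizing on the default stream.
    const StreamConfig stream_config{stream, /*time_kernel_=*/false};
    op.SetWorkSpacePointer(arg_ptr, p_workspace, stream_config);
}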
include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp (+3 -1)

@@ -376,7 +376,9 @@ struct DeviceBatchNormBwdImpl : public DeviceBatchNormBwd<XDataType,
         return (workspace_size);
     };

-    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
+    void SetWorkSpacePointer(BaseArgument* pArg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
     {
         Argument* pArg_ = dynamic_cast<Argument*>(pArg);
include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp (+3 -1)

@@ -354,7 +354,9 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
         return (workspace_size);
     };

-    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
+    void SetWorkSpacePointer(BaseArgument* pArg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
     {
         Argument* pArg_ = dynamic_cast<Argument*>(pArg);
include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp (+3 -1)

@@ -345,7 +345,9 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
         return (workspace_size);
     };

-    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
+    void SetWorkSpacePointer(BaseArgument* pArg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
     {
         Argument* pArg_ = dynamic_cast<Argument*>(pArg);
include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp (+7 -0)

@@ -13,6 +13,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/stream_utility.hpp"

 namespace ck {

@@ -292,6 +293,12 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
     bool IsSupportedArgument(const BaseArgument* p_arg) override
     {
+        if((ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+            ck::get_device_name() == "gfx942"))
+        {
+            return false;
+        }
+
         const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

         if(pArg == nullptr)
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp (+3 -1)

@@ -821,7 +821,9 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
         return (workspace_size);
     };

-    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
+    void SetWorkSpacePointer(BaseArgument* pArg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
     {
         Argument* pArg_ = dynamic_cast<Argument*>(pArg);
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp (+3 -1)

@@ -380,7 +380,9 @@ struct DeviceGemm_Xdl_CShuffle_LdsDirectLoad : public DeviceGemm<ALayout,
             << " LoopScheduler: "
             << LoopSchedToString[LoopSched] << ", "
             << "PipelineVersion: "
-            << PipelineVersionToString[PipelineVer];
+            << PipelineVersionToString[PipelineVer] << ", "
+            << "Prefetch: "
+            << NumGemmKPrefetchStage;
         // clang-format on

         return str.str();
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp (+3 -1)

@@ -226,7 +226,9 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
         }
     }

-    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
+    void SetWorkSpacePointer(BaseArgument* pArg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
     {
         Argument* pArg_ = dynamic_cast<Argument*>(pArg);
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp (+5 -2)

@@ -817,12 +817,15 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
         return arg.group_count_ * sizeof(GroupedGemmKernelArgument<NumDTensor>);
     }

-    void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const override
+    void SetWorkSpacePointer(BaseArgument* p_arg,
+                             void* p_workspace,
+                             const StreamConfig& stream_config = StreamConfig{}) const override
     {
         auto p_arg_ = dynamic_cast<Argument*>(p_arg);

         p_arg_->p_workspace_ = p_workspace;

-        hip_check_error(hipMemset(p_workspace, 0, GetWorkSpaceSize(p_arg)));
+        hip_check_error(
+            hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(p_arg), stream_config.stream_id_));
     }

     static void SetKBatch(Argument& arg, index_t k_batch) { arg.UpdateKBatch(k_batch); }
include/ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp (+3 -1)

@@ -577,7 +577,9 @@ struct DeviceNormalizationFwdSplitKImpl : public DeviceNormalizationFwd<XDataTyp
         return (workspace_size);
     };

-    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
+    void SetWorkSpacePointer(BaseArgument* pArg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
     {
         Argument* pArg_ = dynamic_cast<Argument*>(pArg);
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp (+30 -11)

@@ -236,9 +236,10 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         constexpr auto c_block_size =
             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();

-        return math::max(a_block_space_size_aligned * sizeof(AComputeDataType) +
-                             b_block_space_size_aligned * sizeof(BComputeDataType),
-                         c_block_size * sizeof(CShuffleDataType));
+        return math::max(
+            NumGemmKPrefetchStage * a_block_space_size_aligned * sizeof(AComputeDataType) +
+                NumGemmKPrefetchStage * b_block_space_size_aligned * sizeof(BComputeDataType),
+            c_block_size * sizeof(CShuffleDataType));
     }

     __host__ __device__ static auto

@@ -491,6 +492,22 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
     __device__ __host__ static constexpr auto GetMPerBlock() { return MPerBlock; }

+    template <typename DataType>
+    __device__ static auto AllocateBlockBuffers(void* p_shared,
+                                                int32_t num_elems,
+                                                int32_t offset_elems,
+                                                int32_t max_lds_align)
+    {
+        const int32_t single_buffer_offset =
+            math::integer_least_multiple(num_elems, max_lds_align);
+        return generate_tuple(
+            [&](auto i) {
+                const int32_t local_offset = i * single_buffer_offset;
+                return make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                    static_cast<DataType*>(p_shared) + local_offset + offset_elems, num_elems);
+            },
+            Number<NumGemmKPrefetchStage>{});
+    }
+
     template <bool HasMainKBlockLoop,
               typename AGridDesc_AK0_M_AK1,
               typename BGridDesc_BK0_N_BK1,

@@ -624,12 +641,14 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<AComputeDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+        auto a_block_buffers = AllocateBlockBuffers<AComputeDataType>(
+            p_shared, a_block_desc_ak0_m_ak1.GetElementSpaceSize(), 0, max_lds_align);

-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BComputeDataType*>(p_shared) + a_block_space_size_aligned,
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        const auto b_buffers_offset = a_block_space_size_aligned * NumGemmKPrefetchStage;
+        auto b_block_buffers        = AllocateBlockBuffers<BComputeDataType>(
+            p_shared, b_block_desc_bk0_n_bk1.GetElementSpaceSize(), b_buffers_offset, max_lds_align);

         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);

@@ -645,13 +664,13 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
             a_block_desc_ak0_m_ak1,
             a_blockwise_copy,
             a_grid_buf,
-            a_block_buf,
+            a_block_buffers,
             a_block_slice_copy_step,
             b_grid_desc_bk0_n_bk1,
             b_block_desc_bk0_n_bk1,
             b_blockwise_copy,
             b_grid_buf,
-            b_block_buf,
+            b_block_buffers,
             b_block_slice_copy_step,
             blockwise_gemm,
             c_thread_buf,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v4_direct_load.hpp (+142 -5)

@@ -7,6 +7,20 @@
 #include "ck/utility/loop_scheduler.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

+namespace lds_direct_load {
+
+__device__ void sched_barrier()
+{
+#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
+    // When direct loads and `waitcnt` instructions are submitted using inline asm, the usage of
+    // `sched_barrier` is necessary to make sure no instructions that use the loaded memory
+    // are scheduled by the compiler before the `waitcnt` instruction.
+    __builtin_amdgcn_sched_barrier(0);
+#endif
+}
+
+} // namespace lds_direct_load
+
 namespace ck {

 template <index_t NumPrefetch>

@@ -17,7 +31,6 @@ template <>
 struct GridwiseGemmPipeline_v4<1>
 {
     static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};

     __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; }

@@ -31,13 +44,13 @@ struct GridwiseGemmPipeline_v4<1>
               typename ABlockDesc,
               typename ABlockTransfer,
               typename AGridBuffer,
-              typename ABlockBuffer,
+              typename ABlockBuffers,
               typename ABlockTransferStep,
               typename BGridDesc,
               typename BBlockDesc,
               typename BBlockTransfer,
               typename BGridBuffer,
-              typename BBlockBuffer,
+              typename BBlockBuffers,
               typename BBlockTransferStep,
               typename BlockwiseGemm,
               typename CThreadBuffer>

@@ -45,18 +58,22 @@ struct GridwiseGemmPipeline_v4<1>
                                const ABlockDesc& a_block_desc,
                                ABlockTransfer& a_blockwise_copy,
                                const AGridBuffer& a_grid_buf,
-                               ABlockBuffer& a_block_buf,
+                               ABlockBuffers& a_block_bufs,
                                const ABlockTransferStep& a_block_copy_step,
                                const BGridDesc& b_grid_desc,
                                const BBlockDesc& b_block_desc,
                                BBlockTransfer& b_blockwise_copy,
                                const BGridBuffer& b_grid_buf,
-                               BBlockBuffer& b_block_buf,
+                               BBlockBuffers& b_block_bufs,
                                const BBlockTransferStep& b_block_copy_step,
                                const BlockwiseGemm& blockwise_gemm,
                                CThreadBuffer& c_thread_buf,
                                index_t num_loop)
     {
+        static_assert(ABlockBuffers::Size() == 1 && BBlockBuffers::Size() == 1);
+        auto& a_block_buf = a_block_bufs.At(I0);
+        auto& b_block_buf = b_block_bufs.At(I0);
+
         a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
         b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);

@@ -74,10 +91,12 @@ struct GridwiseGemmPipeline_v4<1>
             do
             {
                 block_sync_lds_direct_load();
+                lds_direct_load::sched_barrier();
                 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);

                 block_sync_lds_direct_load();
+                lds_direct_load::sched_barrier();
                 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
                 b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);

@@ -92,10 +111,128 @@ struct GridwiseGemmPipeline_v4<1>
         // tail
         {
             block_sync_lds_direct_load();
+            lds_direct_load::sched_barrier();
             blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
         }
     }
 };

+// 2-stages prefetch
+template <>
+struct GridwiseGemmPipeline_v4<2>
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
+    __host__ __device__ static constexpr bool IsSupported(index_t num_loop)
+    {
+        return num_loop % 2 == 0;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
+    {
+        return (num_loop / 2) > 1;
+    }
+
+    template <bool HasMainLoop,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffers,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffers,
+              typename BBlockTransferStep,
+              typename BlockwiseGemm,
+              typename CThreadBuffer>
+    __device__ static void Run(const AGridDesc& a_grid_desc,
+                               const ABlockDesc& a_block_desc,
+                               ABlockTransfer& a_blockwise_copy,
+                               const AGridBuffer& a_grid_buf,
+                               ABlockBuffers& a_block_bufs,
+                               const ABlockTransferStep& a_block_copy_step,
+                               const BGridDesc& b_grid_desc,
+                               const BBlockDesc& b_block_desc,
+                               BBlockTransfer& b_blockwise_copy,
+                               const BGridBuffer& b_grid_buf,
+                               BBlockBuffers& b_block_bufs,
+                               const BBlockTransferStep& b_block_copy_step,
+                               const BlockwiseGemm& blockwise_gemm,
+                               CThreadBuffer& c_thread_buf,
+                               index_t num_loop)
+    {
+        static_assert(ABlockBuffers::Size() == 2 && BBlockBuffers::Size() == 2);
+        auto& a_block_buf1 = a_block_bufs.At(I0);
+        auto& a_block_buf2 = a_block_bufs.At(I1);
+        auto& b_block_buf1 = b_block_bufs.At(I0);
+        auto& b_block_buf2 = b_block_bufs.At(I1);
+
+        a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf1);
+        b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf1);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                block_sync_lds_direct_load();
+                lds_direct_load::sched_barrier();
+                a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf2);
+                b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf2);
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                blockwise_gemm.Run(a_block_buf1, b_block_buf1, c_thread_buf);
+
+                block_sync_lds_direct_load();
+                lds_direct_load::sched_barrier();
+                a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf1);
+                b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf1);
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                blockwise_gemm.Run(a_block_buf2, b_block_buf2, c_thread_buf);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+
+        // tail
+        {
+            block_sync_lds_direct_load();
+            lds_direct_load::sched_barrier();
+            a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf2);
+            b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf2);
+
+            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+            blockwise_gemm.Run(a_block_buf1, b_block_buf1, c_thread_buf);
+
+            block_sync_lds_direct_load();
+            lds_direct_load::sched_barrier();
+            blockwise_gemm.Run(a_block_buf2, b_block_buf2, c_thread_buf);
+        }
+    }
+};
+
 } // namespace ck
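To make the scheduling idea behind the new 2-stage specialization easier to see, here is an illustrative host-side sketch that is not CK code and not part of the commit: two staging buffers ping-pong so the copy of tile i+1 overlaps the math on tile i, and the trip count is assumed even (matching IsSupported's num_loop % 2 == 0 requirement). All names are invented for the illustration; the real kernel additionally needs the LDS synchronization and sched_barrier calls shown above.

// Conceptual sketch only; assumes num_loop >= 2 and even.
#include <array>
#include <cstddef>
#include <vector>

template <typename CopyFn, typename GemmFn>
void two_stage_pipeline(std::size_t num_loop, CopyFn copy_tile, GemmFn run_gemm)
{
    std::array<std::vector<float>, 2> stage; // stands in for the two LDS buffers
    std::size_t tile = 0;

    copy_tile(tile++, stage[0]); // prologue: fill buffer 0

    for(; tile + 1 < num_loop; tile += 2)
    {
        copy_tile(tile, stage[1]);     // prefetch the next tile into buffer 1 ...
        run_gemm(stage[0]);            // ... while buffer 0 is consumed
        copy_tile(tile + 1, stage[0]); // and vice versa on the second half-step
        run_gemm(stage[1]);
    }

    copy_tile(tile, stage[1]); // epilogue mirrors the in-kernel "tail"
    run_gemm(stage[0]);
    run_gemm(stage[1]);
}

int main()
{
    auto copy = [](std::size_t, std::vector<float>& buf) { buf.assign(16, 1.0f); };
    auto gemm = [](const std::vector<float>&) { /* consume the staged tile */ };
    two_stage_pipeline(6, copy, gemm);
    return 0;
}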
include/ck/utility/amd_buffer_addressing.hpp (+10 -0)

@@ -972,6 +972,15 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
     const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size);
     const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;

+#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
+    T* lds_ptr = lds_base_ptr + lds_offset;
+    auto const lds_ptr_sgpr =
+        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
+    asm volatile("s_mov_b32 m0, %0; \n\t"
+                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
+                 "v"(global_offset_bytes),
+                 "s"(src_resource));
+#else
     // LDS pointer must be attributed with the LDS address space.
     __attribute__((address_space(3))) uint32_t* lds_ptr =
         reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(

@@ -979,6 +988,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
     llvm_amdgcn_raw_buffer_load_lds(
         src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
+#endif
 }

 } // namespace ck
include/ck/utility/tuple_helper.hpp (+88 -0)

@@ -5,6 +5,7 @@
 #include "functional4.hpp"
 #include "tuple.hpp"
+#include "is_detected.hpp"

 namespace ck {

@@ -33,6 +34,28 @@ __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple<X&...>&
         ty);
 }

+template <typename... X, typename... Y>
+__host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx, const Tuple<Y...>& ty)
+{
+    return unpack2(
+        [&](auto... zs) { return Tuple<decltype(zs)...>{std::forward<decltype(zs)>(zs)...}; },
+        tx,
+        ty);
+}
+
+// Support any number of tuples to concat (also 1)
+template <typename... X>
+__host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx)
+{
+    return tx;
+}
+
+template <typename... X, typename... Tuples>
+__host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx, const Tuples&... tuples)
+{
+    return concat_tuple(tx, concat_tuple(tuples...));
+}
+
 namespace detail {

 template <typename F, typename X, index_t... Is>

@@ -78,4 +101,69 @@ __host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y,
         f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
 }

+// By default unroll to the flattened form
+template <index_t Depth = 0, index_t MaxDepth = -1>
+__host__ __device__ constexpr auto UnrollNestedTuple(const Tuple<>& element)
+{
+    return element;
+}
+
+template <index_t Depth = 0, index_t MaxDepth = -1, typename T>
+__host__ __device__ constexpr auto UnrollNestedTuple(const T& element)
+{
+    return make_tuple(element);
+}
+
+template <index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
+__host__ __device__ constexpr auto UnrollNestedTuple(const Tuple<Ts...>& tuple)
+{
+    if constexpr(Depth == MaxDepth)
+    {
+        return tuple;
+    }
+    else
+    {
+        return unpack(
+            [&](auto&&... ts) { return concat_tuple(UnrollNestedTuple<Depth + 1, MaxDepth>(ts)...); },
+            tuple);
+    }
+}
+
+template <typename... Ts>
+__host__ __device__ constexpr auto TupleReverse(const Tuple<Ts...>& tuple)
+{
+    return generate_tuple(
+        [&](auto i) {
+            using Idx = Number<Tuple<Ts...>::Size() - i - 1>;
+            return tuple.At(Idx{});
+        },
+        Number<Tuple<Ts...>::Size()>{});
+}
+
+// Reduce tuple values in a specific range using Function
+template <index_t Idx, index_t End, typename F, typename... Ts>
+__host__ __device__ constexpr auto TupleReduce(F&& f, const Tuple<Ts...>& tuple)
+{
+    static_assert(Idx < End, "Wrong parameters for TupleReduce");
+    if constexpr(Idx + 1 == End)
+    {
+        return tuple.At(Number<Idx>{});
+    }
+    else
+    {
+        return f(tuple.At(Number<Idx>{}), TupleReduce<Idx + 1, End>(f, tuple));
+    }
+}
+
+template <typename T>
+using is_tuple = decltype(std::declval<T&>().IsTuple());
+
+template <typename... Ts>
+__host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
+{
+    return (is_detected<is_tuple, Ts>::value || ...);
+}
+
 } // namespace ck
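A short host-side sketch, not part of the commit, exercising the helpers added above; every call (make_tuple, UnrollNestedTuple, TupleReduce, concat_tuple, IsNestedTuple) is defined in tuple_helper.hpp or tuple.hpp, while the example program itself and its concrete values are assumptions. It would be compiled with hipcc alongside the CK headers.

// Minimal usage sketch of the new tuple helpers.
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple_helper.hpp"

int main()
{
    using namespace ck;

    const auto nested = make_tuple(make_tuple(1, 2), 3, make_tuple(4, make_tuple(5)));

    // Flatten completely: (1, 2, 3, 4, 5).
    const auto flat = UnrollNestedTuple(nested);

    // Reduce over the whole range: 1 * 2 * 3 * 4 * 5 = 120.
    const auto prod =
        TupleReduce<0, decltype(flat)::Size()>([](auto x, auto y) { return x * y; }, flat);

    // concat_tuple accepts any number of tuples, including a single one.
    const auto joined = concat_tuple(make_tuple(0), flat);

    // IsNestedTuple is what tells the Layout wrapper when unrolling must recurse.
    const bool has_nesting = IsNestedTuple(nested);

    return (prod == 120 && has_nesting && joined.At(Number<0>{}) == 0) ? 0 : 1;
}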
library/src/tensor_operation_instance/gpu/CMakeLists.txt (+146 -43)

@@ -61,7 +61,12 @@ endfunction(add_instance_library INSTANCE_NAME)
 file(GLOB dir_list LIST_DIRECTORIES true *)
-set(CK_DEVICE_INSTANCES)
+set(CK_DEVICE_OTHER_INSTANCES)
+set(CK_DEVICE_GEMM_INSTANCES)
+set(CK_DEVICE_CONV_INSTANCES)
+set(CK_DEVICE_MHA_INSTANCES)
+set(CK_DEVICE_CONTRACTION_INSTANCES)
+set(CK_DEVICE_REDUCTION_INSTANCES)
 FOREACH(subdir_path ${dir_list})
     set(target_dir)
     IF(IS_DIRECTORY "${subdir_path}")

@@ -125,7 +130,19 @@ FOREACH(subdir_path ${dir_list})
         if((add_inst EQUAL 1))
             get_filename_component(target_dir ${subdir_path} NAME)
             add_subdirectory(${target_dir})
-            list(APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            if("${cmake_instance}" MATCHES "gemm")
+                list(APPEND CK_DEVICE_GEMM_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "conv")
+                list(APPEND CK_DEVICE_CONV_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "mha")
+                list(APPEND CK_DEVICE_MHA_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "contr")
+                list(APPEND CK_DEVICE_CONTRACTION_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "reduce")
+                list(APPEND CK_DEVICE_REDUCTION_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            else()
+                list(APPEND CK_DEVICE_OTHER_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            endif()
             message("add_instance_directory ${subdir_path}")
         else()
             message("skip_instance_directory ${subdir_path}")

@@ -133,51 +150,137 @@ FOREACH(subdir_path ${dir_list})
     ENDIF()
 ENDFOREACH()

-add_library(device_operations STATIC ${CK_DEVICE_INSTANCES})
-add_library(composablekernels::device_operations ALIAS device_operations)
+if(CK_DEVICE_OTHER_INSTANCES)
+    add_library(device_other_operations STATIC ${CK_DEVICE_OTHER_INSTANCES})
+    add_library(composablekernels::device_other_operations ALIAS device_other_operations)
+    set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(device_other_operations PUBLIC
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device/impl>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/utility>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/quantization>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/softmax>
+    )
+    rocm_install(TARGETS device_other_operations EXPORT device_other_operationsTargets)
+    rocm_install(EXPORT device_other_operationsTargets
+        FILE composable_kerneldevice_other_operationsTargets.cmake
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
+endif()
+if(CK_DEVICE_GEMM_INSTANCES)
+    add_library(device_gemm_operations STATIC ${CK_DEVICE_GEMM_INSTANCES})
+    add_library(composablekernels::device_gemm_operations ALIAS device_gemm_operations)
+    target_compile_features(device_gemm_operations PUBLIC)
+    set_target_properties(device_gemm_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(device_gemm_operations PUBLIC
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+    )
+    rocm_install(TARGETS device_gemm_operations EXPORT device_gemm_operationsTargets)
+    rocm_install(EXPORT device_gemm_operationsTargets
+        FILE composable_kerneldevice_gemm_operationsTargets.cmake
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
+endif()
+if(CK_DEVICE_CONV_INSTANCES)
+    add_library(device_conv_operations STATIC ${CK_DEVICE_CONV_INSTANCES})
+    add_library(composablekernels::device_conv_operations ALIAS device_conv_operations)
+    target_compile_features(device_conv_operations PUBLIC)
+    set_target_properties(device_conv_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(device_conv_operations PUBLIC
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd>
+    )
+    rocm_install(TARGETS device_conv_operations EXPORT device_conv_operationsTargets)
+    rocm_install(EXPORT device_conv_operationsTargets
+        FILE composable_kerneldevice_conv_operationsTargets.cmake
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
+endif()
+if(CK_DEVICE_MHA_INSTANCES)
+    add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
+    add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
+    target_compile_features(device_mha_operations PUBLIC)
+    set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(device_mha_operations PUBLIC
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/mha>
+    )
+    rocm_install(TARGETS device_mha_operations EXPORT device_mha_operationsTargets)
+    rocm_install(EXPORT device_mha_operationsTargets
+        FILE composable_kerneldevice_mha_operationsTargets.cmake
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
+endif()
+if(CK_DEVICE_CONTRACTION_INSTANCES)
+    add_library(device_contraction_operations STATIC ${CK_DEVICE_CONTRACTION_INSTANCES})
+    add_library(composablekernels::device_contraction_operations ALIAS device_contraction_operations)
+    target_compile_features(device_contraction_operations PUBLIC)
+    set_target_properties(device_contraction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(device_contraction_operations PUBLIC
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/contraction>
+    )
+    rocm_install(TARGETS device_contraction_operations EXPORT device_contraction_operationsTargets)
+    rocm_install(EXPORT device_contraction_operationsTargets
+        FILE composable_kerneldevice_contraction_operationsTargets.cmake
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
+endif()
+if(CK_DEVICE_REDUCTION_INSTANCES)
+    add_library(device_reduction_operations STATIC ${CK_DEVICE_REDUCTION_INSTANCES})
+    add_library(composablekernels::device_reduction_operations ALIAS device_reduction_operations)
+    target_compile_features(device_reduction_operations PUBLIC)
+    set_target_properties(device_reduction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(device_reduction_operations PUBLIC
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
+    )
+    rocm_install(TARGETS device_reduction_operations EXPORT device_reduction_operationsTargets)
+    rocm_install(EXPORT device_reduction_operationsTargets
+        FILE composable_kerneldevice_reduction_operationsTargets.cmake
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
+endif()
+add_library(device_operations INTERFACE)
+target_link_libraries(device_operations INTERFACE
+    device_contraction_operations
+    device_conv_operations
+    device_gemm_operations
+    device_other_operations
+    device_reduction_operations
+    utility)

 set(DEV_OPS_INC_DIRS
     ${PROJECT_SOURCE_DIR}/include/ck/
     ${PROJECT_SOURCE_DIR}/library/include/ck/
 )

-set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(device_operations PUBLIC
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device/impl>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/utility>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
-)
-#once new arches are enabled make this an option on the main cmake file
-# and pass down here to be exported
-target_compile_options(device_operations PRIVATE
-    --offload-arch=gfx908
-    --offload-arch=gfx90a
-    --offload-arch=gfx1030
-    --offload-arch=gfx1100
-)
-# install(TARGETS device_operations LIBRARY DESTINATION lib)
-rocm_install(TARGETS device_operations EXPORT device_operationsTargets)
 rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
-rocm_install(EXPORT device_operationsTargets
-    FILE composable_kerneldevice_operationsTargets.cmake
-    NAMESPACE composable_kernel::
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_lds_direct_load_f16_f16_f16_mk_nk_mn_instance.cpp
View file @
bc5b84b1
...
@@ -35,7 +35,21 @@ using device_gemm_xdl_c_shuffle_lds_direct_load_f16_f16_f16_mk_nk_mn_instances =
    // ##################################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| | | PerVector| | Lengths_K0_N_K1| | | PerVector| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
    // ##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   1, 256,  64,  64, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmMNPadding, 1, 256,  64,  64, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   1, 256, 128,  64, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   1,  64,  32,  32, 64, 8, 8, 32, 32, 1, 1, S<1, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<1, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   1,  64,  64,  32, 32, 8, 8, 32, 32, 2, 1, S<1, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<1, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   1, 128,  64,  32, 32, 8, 8, 32, 32, 1, 1, S<2, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<2, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   2, 256,  64,  64, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   2, 256, 128,  64, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 0, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 0, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   2, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 0, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 0, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   2, 256,  32, 128, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   2,  64,  32,  32, 64, 8, 8, 32, 32, 1, 1, S<1, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<1, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault,   2, 128,  64,  32, 32, 8, 8, 32, 32, 1, 1, S<2, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<2, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Row, Col, Row, F16, F16, F16, F32, F32, PassThrough, PassThrough, PassThrough, GemmMNPadding, 2, 256,  64,  64, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>
    // clang-format on
    >;
...
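Each instance file touched by this commit follows the same registration pattern: a compile-time std::tuple whose elements are fully specialized device-GEMM types, expanded into a runtime vector of a common interface so a dispatcher can profile and choose among the tuning configurations. The self-contained sketch below reproduces only that mechanism with toy types; DeviceGemmBase, ToyDeviceGemm, and add_instances are illustrative stand-ins, not the composable_kernel API.

// A minimal sketch (assumed names, not the CK API) of the tuple-of-configurations
// pattern used by the instance files above.
#include <iostream>
#include <memory>
#include <string>
#include <tuple>
#include <vector>

// Stand-in for the common device-operation interface.
struct DeviceGemmBase
{
    virtual ~DeviceGemmBase() = default;
    virtual std::string GetTypeString() const = 0;
};

// Stand-in for one tuning configuration; the real template carries dozens of
// parameters (layouts, data types, tile sizes, transfer descriptors, and so on).
template <int BlockSize, int MPerBlock, int NPerBlock, int KPerBlock>
struct ToyDeviceGemm : DeviceGemmBase
{
    std::string GetTypeString() const override
    {
        return "ToyDeviceGemm<" + std::to_string(BlockSize) + ", " + std::to_string(MPerBlock) +
               ", " + std::to_string(NPerBlock) + ", " + std::to_string(KPerBlock) + ">";
    }
};

// Compile-time list of configurations, analogous to the instance tuples above.
using ToyInstances = std::tuple<ToyDeviceGemm<256, 64, 64, 32>,
                                ToyDeviceGemm<256, 128, 128, 32>,
                                ToyDeviceGemm<64, 32, 32, 64>>;

// Expand the tuple and append one heap-allocated object per configuration,
// analogous to the library's registration helpers.
template <typename... Configs>
void add_instances(std::vector<std::unique_ptr<DeviceGemmBase>>& out, std::tuple<Configs...>)
{
    (out.push_back(std::make_unique<Configs>()), ...);
}

int main()
{
    std::vector<std::unique_ptr<DeviceGemmBase>> instances;
    add_instances(instances, ToyInstances{});
    for(const auto& p : instances)
        std::cout << p->GetTypeString() << '\n';
    return 0;
}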
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_kn_mn_instance.cpp (View file @ bc5b84b1)
...
@@ -32,7 +32,8 @@ using device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_kn_mn_instances =
    // ##################################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraM| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
    // ##################################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| | | PerVector| | Lengths_K0_N_K1| | | PerVector| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
    // ##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 8, 8>, S<0, 2, 1>, 1, 1, 1, S<4, 8, 8>, S<0, 2, 1>, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 4>,
    DeviceGemm_Xdl_CShuffle_LdsDirectLoad<Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 64, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 8, 8>, S<0, 2, 1>, 1, 1, 1, S<4, 8, 8>, S<0, 2, 1>, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 4>
    // clang-format on
    >;
...
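A note on the GemmDefault vs. GemmMNPadding specializations that differentiate several instances above: the padded variants exist so that M and N problem sizes which do not divide evenly into the block tile (MPerBlock x NPerBlock) can still be handled, at some cost in efficiency. The check below is only an illustrative sketch of that applicability idea; the function name and the exact predicate are assumptions, not code from this commit.

// Illustrative only: the kind of divisibility condition a no-padding ("Default")
// tile configuration implies. MPerBlock/NPerBlock mirror the tile sizes in the
// instance lists above; tile_fits_without_padding is a hypothetical name.
#include <cstdio>

constexpr bool tile_fits_without_padding(long M, long N, long MPerBlock, long NPerBlock)
{
    return (M % MPerBlock == 0) && (N % NPerBlock == 0);
}

int main()
{
    // 1024 x 1024 divides a 64 x 64 tile exactly, so a GemmDefault instance applies;
    // 1000 x 1000 does not, so a padded (GemmMNPadding) instance would be needed.
    std::printf("%d %d\n",
                tile_fits_without_padding(1024, 1024, 64, 64),
                tile_fits_without_padding(1000, 1000, 64, 64));
    return 0;
}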