gaoqiong / composable_kernel_ROCM · Commits

Commit b79df771
Authored Jul 12, 2022 by carlushuang
Merge remote-tracking branch 'origin/develop' into cpu_avx2
Parents: 05d38218, 63914743
Changes: 450
Showing 20 changed files with 457 additions and 72 deletions.
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp             +7    -6
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp        +25   -9
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp   +8    -7
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp   +3    -0
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp   +7    -6
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp   +7    -3
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp   +9    -8
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp   +8    -7
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp   +8    -7
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp     +298  -0
include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp                               +7    -6
include/ck/utility/amd_address_space.hpp                                           +5    -4
include/ck/utility/amd_buffer_addressing.hpp                                       +5    -0
include/ck/utility/amd_inline_asm.hpp                                              +3    -0
include/ck/utility/amd_llvm_intrinsic.hpp                                          +3    -0
include/ck/utility/amd_xdlops.hpp                                                  +3    -0
include/ck/utility/array.hpp                                                       +3    -0
include/ck/utility/array_multi_index.hpp                                           +3    -0
include/ck/utility/c_style_pointer_cast.hpp                                        +3    -0
include/ck/utility/common_header.hpp                                               +42   -9
Too many changes to show: to preserve performance, only 450 of 450+ changed files are displayed.
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp (+7, -6)

-#ifndef CK_THREADWISE_TENSOR_SET_HPP
-#define CK_THREADWISE_TENSOR_SET_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 namespace ck
 {
 ...
@@ -56,4 +58,3 @@ struct ThreadwiseTensorSliceSet_v1
 };
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp (+25, -9)

-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 namespace ck
 {
 ...
@@ -234,9 +236,14 @@ template <typename SrcData,
           index_t SrcScalarPerVector,
           index_t SrcScalarStrideInVector,
           bool SrcResetCoordinateAfterRun,
+          bool InvalidElementAsNaN = false,
           typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
 struct ThreadwiseTensorSliceTransfer_v2
 {
+    static_assert((InvalidElementAsNaN && !std::is_integral<DstData>::value) ||
+                      (!InvalidElementAsNaN),
+                  "Filling invalid element as NaN is only for floating point types");
+
     static constexpr index_t nDim = SliceLengths::Size();

     using Index = MultiIndex<nDim>;
 ...
@@ -316,8 +323,18 @@ struct ThreadwiseTensorSliceTransfer_v2
                     dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                              i * src_scalar_step_in_vector);

-                dst_buf(Number<dst_offset>{}) =
-                    type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
+                if constexpr(InvalidElementAsNaN)
+                {
+                    dst_buf(Number<dst_offset>{}) =
+                        is_src_valid
+                            ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
+                            : NumericLimits<DstData>::QuietNaN();
+                }
+                else
+                {
+                    dst_buf(Number<dst_offset>{}) =
+                        type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
+                }
             });

             if constexpr(idx_1d.value != num_access - 1)
 ...
@@ -1168,4 +1185,3 @@ struct ThreadwiseTensorSliceTransfer_v4
 };
 } // namespace ck
-#endif
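The new InvalidElementAsNaN path above turns invalid (out-of-bounds) source reads into quiet NaN instead of silently converting whatever the buffer returned, and the static_assert restricts the flag to floating-point destinations. Below is a minimal standalone sketch of the same compile-time pattern; the function name load_element and its signature are hypothetical illustrations, not part of CK:

// Standalone sketch of the InvalidElementAsNaN pattern: an `if constexpr` branch that
// writes quiet NaN for invalid elements, guarded against integral destination types.
#include <cassert>
#include <cmath>
#include <limits>
#include <type_traits>

template <typename DstData, bool InvalidElementAsNaN = false>
DstData load_element(DstData value, bool is_src_valid)
{
    static_assert((InvalidElementAsNaN && !std::is_integral<DstData>::value) ||
                      !InvalidElementAsNaN,
                  "Filling invalid element as NaN is only for floating point types");

    if constexpr(InvalidElementAsNaN)
    {
        // invalid reads become quiet NaN so they are easy to spot downstream
        return is_src_valid ? value : std::numeric_limits<DstData>::quiet_NaN();
    }
    else
    {
        // default behaviour: the value is passed through unchanged
        return value;
    }
}

int main()
{
    assert(load_element<float, true>(1.0f, true) == 1.0f);
    assert(std::isnan(load_element<float, true>(1.0f, false)));
    assert(load_element<float, false>(2.0f, false) == 2.0f);
    return 0;
}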
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp (+8, -7)

-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "static_tensor.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor/static_tensor.hpp"
 namespace ck
 {
 ...
@@ -789,4 +791,3 @@ struct ThreadwiseTensorSliceTransfer_v3r1
 };
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp (+3, -0)

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
 #define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
 ...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp (+7, -6)

-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 namespace ck
 {
 // Assume:
 ...
@@ -171,4 +173,3 @@ struct ThreadwiseTensorSliceTransfer_v4r1
 };
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp (+7, -3)

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck
 {
 ...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp (+9, -8)

-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck
 {
 ...
@@ -206,7 +208,6 @@ struct ThreadwiseTensorSliceTransfer_v6r1
     SrcCoord src_coord_;
     DstCoord dst_coord_;
     const ElementwiseOperation element_op_;
-}; // namespace ck
+};
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp (+8, -7)

-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck
 {
 ...
@@ -256,4 +258,3 @@ struct ThreadwiseTensorSliceTransfer_v6r2
 };
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp (+8, -7)

-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck
 {
 ...
@@ -306,4 +308,3 @@ struct ThreadwiseTensorSliceTransfer_v6r3
 };
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp (new file, mode 100644, +298, -0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"

namespace ck {

// Thread-level multi-source, multi-destination tensor slice data movement
// Assume:
//   1. All sources and destinations are DynamicBuffer
//   2. Same VectorDim and ScalarPerVector for all sources and destinations
//   3. DstInMemOps are per destination tensor
//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
//   6. Does not need to know src_descs and dst_descs at compile-time
//   7. Does not need to know src_slice_origins and dst_slice_origins at compile-time
//
// Does following things to avoid scratch memory issue
//   1. Use StaticallyIndexedArray or vector_type instead of C array for thread buffer
//   2. Pass tensor descriptors by reference (or tuple of references)
//   3. Does not keep reference to tensor descriptor
//   4. Does not construct new tensor coordinate when calling Run()
template <typename SrcDatas,
          typename DstDatas,
          typename SrcDescs,
          typename DstDescs,
          typename ElementwiseOperation,
          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
          typename SliceLengths,
          typename DimAccessOrder,
          index_t VectorDim,
          index_t ScalarPerVector,
          typename SrcResetCoordinateAfterRunFlags, // Sequence<bool ...>
          typename DstResetCoordinateAfterRunFlags> // Sequence<bool ...>
struct ThreadwiseTensorSliceTransfer_v7
{
    static constexpr auto I0 = Number<0>{};

    static constexpr index_t nDim = SliceLengths::Size();
    static constexpr index_t nSrc = SrcDescs::Size();
    static constexpr index_t nDst = DstDescs::Size();

    using Index = MultiIndex<nDim>;

    // return a tuple of coordinates for a tuple of tensors
    template <typename Descs,
              typename Indices,
              enable_if_t<Descs::Size() == Indices::Size(), bool> = false>
    static constexpr auto MakeCoordinates(const Descs& descs, const Indices& indices)
    {
        return generate_tuple([&](auto i) { return make_tensor_coordinate(descs[i], indices[i]); },
                              Number<Descs::Size()>{});
    }

    using SrcCoords = decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray<Index, nSrc>{}));
    using DstCoords = decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray<Index, nDst>{}));

    // scalar per access on each dim
    // FIXME: don't use lambda_scalar_per_access
    static constexpr auto scalar_per_access = generate_sequence(
        detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

    using SpaceFillingCurve =
        SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;

    __device__ constexpr ThreadwiseTensorSliceTransfer_v7(
        const SrcDescs& src_descs,
        const StaticallyIndexedArray<Index, nSrc>& src_slice_origins,
        const DstDescs& dst_descs,
        const StaticallyIndexedArray<Index, nDst>& dst_slice_origins,
        const ElementwiseOperation& element_op)
        : src_coords_(MakeCoordinates(src_descs, src_slice_origins)),
          dst_coords_(MakeCoordinates(dst_descs, dst_slice_origins)),
          element_op_(element_op)
    {
        static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
                      "wrong! cannot evenly divide");
    }

    template <typename Indices, enable_if_t<SrcDescs::Size() == Indices::Size(), bool> = false>
    __device__ void SetSrcSliceOrigins(const SrcDescs& src_descs,
                                       const Indices& src_slice_origin_idxs)
    {
        static_for<0, nSrc, 1>{}([&](auto i) {
            src_coords_(i) = make_tensor_coordinate(src_descs[i], src_slice_origin_idxs[i]);
        });
    }

    template <typename Indices, enable_if_t<DstDescs::Size() == Indices::Size(), bool> = false>
    __device__ void SetDstSliceOrigins(const DstDescs& dst_descs,
                                       const Indices& dst_slice_origin_idxs)
    {
        static_for<0, nDst, 1>{}([&](auto i) {
            dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]);
        });
    }

    // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
    // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
    // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
    // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
    template <typename SrcBuffers,
              typename DstBuffers,
              enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
                              DstDescs::Size() == DstBuffers::Size(),
                          bool> = false>
    __device__ void Run(const SrcDescs& src_descs,
                        const SrcBuffers& src_bufs,
                        const DstDescs& dst_descs,
                        DstBuffers dst_bufs)
    {
        auto generate_vectors = [&](auto data_types) {
            constexpr index_t num = data_types.Size();

            return generate_tuple(
                [&](auto i) {
                    using DataType = remove_cvref_t<decltype(data_types[i])>;

                    return vector_type_maker_t<DataType, ScalarPerVector>{};
                },
                Number<num>{});
        };

        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        // loop over space-filling curve
        static_for<0, num_access, 1>{}([&](auto iAccess) {
            auto src_vectors = generate_vectors(SrcDatas{});
            auto dst_vectors = generate_vectors(DstDatas{});

            // copy data from src_bufs into src_vectors
            static_for<0, nSrc, 1>{}([&](auto i) {
                using src_vector_t = typename remove_cvref_t<decltype(src_vectors[i])>::type;

                const bool is_src_valid =
                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_descs[i],
                                                                                src_coords_[i]);

                src_vectors(i).template AsType<src_vector_t>()(I0) =
                    src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(),
                                                           is_src_valid);
            });

            // apply pointwise function
            static_for<0, ScalarPerVector, 1>{}([&](auto i) {
                // get reference to src data
                const auto src_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto iSrc) -> const auto& {
                        using SrcData = remove_cvref_t<tuple_element_t<iSrc.value, SrcDatas>>;

                        return src_vectors[iSrc].template AsType<SrcData>()[i];
                    },
                    Number<nSrc>{});

                // get reference to dst data
                auto dst_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto iDst) -> auto& {
                        using DstData = remove_cvref_t<tuple_element_t<iDst.value, DstDatas>>;

                        return dst_vectors(iDst).template AsType<DstData>()(i);
                    },
                    Number<nDst>{});

                // apply pointwise function
                // pointwise function signature:
                // element_op_(dst_data_refs[I0],
                //             dst_data_refs[I1],
                //             ...,
                //             src_data_refs[I0],
                //             src_data_refs[I1],
                //             ...)
                unpack2(element_op_, dst_data_refs, src_data_refs);
            });

            // copy data from dst_vectors into dst_bufs
            static_for<0, nDst, 1>{}([&](auto i) {
                using dst_vector_t = typename remove_cvref_t<decltype(dst_vectors[i])>::type;

                const bool is_dst_valid =
                    coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i],
                                                                                dst_coords_[i]);

                constexpr InMemoryDataOperationEnum DstInMemOp =
                    static_cast<InMemoryDataOperationEnum>(DstInMemOps::At(i.value));

                dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
                    dst_coords_[i].GetOffset(),
                    is_dst_valid,
                    dst_vectors[i].template AsType<dst_vector_t>()[I0]);
            });

            // move coordinate
            if constexpr(iAccess.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(iAccess);

                static_for<0, nSrc, 1>{}([&](auto i) {
                    move_tensor_coordinate(
                        src_descs[i],
                        src_coords_(i),
                        make_tensor_coordinate_step(src_descs[i], forward_step));
                });

                static_for<0, nDst, 1>{}([&](auto i) {
                    move_tensor_coordinate(
                        dst_descs[i],
                        dst_coords_(i),
                        make_tensor_coordinate_step(dst_descs[i], forward_step));
                });
            }
        });

        // move coordinate back to slice origin (or not)
        static_for<0, nSrc, 1>{}([&](auto i) {
            if constexpr(SrcResetCoordinateAfterRunFlags::At(i))
            {
                const auto src_reset_step =
                    make_tensor_coordinate_step(src_descs[i], GetCoordinateResetStep());

                move_tensor_coordinate(src_descs[i], src_coords_(i), src_reset_step);
            }
        });

        static_for<0, nDst, 1>{}([&](auto i) {
            if constexpr(DstResetCoordinateAfterRunFlags::At(i))
            {
                const auto dst_reset_step =
                    make_tensor_coordinate_step(dst_descs[i], GetCoordinateResetStep());

                move_tensor_coordinate(dst_descs[i], dst_coords_(i), dst_reset_step);
            }
        });
    }

    __device__ static constexpr auto GetCoordinateResetStep()
    {
        constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();

        if constexpr(num_access == 0)
        {
            return typename SpaceFillingCurve::Index{};
        }
        else
        {
            constexpr auto reset_step =
                SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});

            return reset_step;
        }
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
    template <index_t ISrc>
    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs,
                                       Number<ISrc> iSrc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
        const auto adjusted_step_idx = SrcResetCoordinateAfterRunFlags::At(iSrc)
                                           ? src_slice_origin_step_idx
                                           : src_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_descs[iSrc], adjusted_step_idx);

        move_tensor_coordinate(src_descs[iSrc], src_coords_(iSrc), adjusted_step);
    }

    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    template <index_t IDst>
    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs,
                                       Number<IDst> iDst,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then need to adjust the step here
        const auto adjusted_step_idx = DstResetCoordinateAfterRunFlags::At(iDst)
                                           ? dst_slice_origin_step_idx
                                           : dst_slice_origin_step_idx + GetCoordinateResetStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(dst_descs[iDst], adjusted_step_idx);

        move_tensor_coordinate(dst_descs[iDst], dst_coords_(iDst), adjusted_step);
    }

    private:
    SrcCoords src_coords_;
    DstCoords dst_coords_;
    const ElementwiseOperation element_op_;
};

} // namespace ck
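ThreadwiseTensorSliceTransfer_v7::Run() above walks the space-filling curve and, at each access, gathers one vector per source buffer, calls the elementwise functor with destination references first and source references second (via unpack2), then stores one vector per destination. Below is a standalone C++17 sketch of that gather/apply/scatter structure; the name run_transfer and the use of plain std::array buffers are hypothetical stand-ins for CK's DynamicBuffer and tensor coordinates, not the library API:

// Standalone sketch: two sources, one destination, one "access" per element index.
#include <array>
#include <cassert>
#include <cstddef>

template <typename ElementOp, std::size_t N>
void run_transfer(ElementOp element_op,
                  const std::array<float, N>& src0,
                  const std::array<float, N>& src1,
                  std::array<float, N>& dst0)
{
    for(std::size_t i = 0; i < N; ++i) // one iteration per access along the slice
    {
        // destinations first, then sources, mirroring
        // unpack2(element_op_, dst_data_refs, src_data_refs) above
        element_op(dst0[i], src0[i], src1[i]);
    }
}

int main()
{
    std::array<float, 4> a{1, 2, 3, 4};
    std::array<float, 4> b{10, 20, 30, 40};
    std::array<float, 4> out{};

    run_transfer([](float& d, float x, float y) { d = x + y; }, a, b, out);

    assert(out[2] == 33.0f);
    return 0;
}

The real class additionally vectorizes each access (ScalarPerVector elements at a time) and moves per-tensor coordinates along the curve instead of using a flat index.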
include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp (+7, -6)

-#ifndef CK_XDLOPS_GEMM_HPP
-#define CK_XDLOPS_GEMM_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "common_header.hpp"
-#include "math.hpp"
-#include "amd_xdlops.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/utility/amd_xdlops.hpp"
 namespace ck
 {
 ...
@@ -786,4 +788,3 @@ struct XdlopsGemm
 };
 } // namespace ck
-#endif
include/ck/utility/amd_address_space.hpp (+5, -4)

-#ifndef CK_AMD_ADDRESS_SPACE_HPP
-#define CK_AMD_ADDRESS_SPACE_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
-#include "config.hpp"
+#include "ck/ck.hpp"
 #include "c_style_pointer_cast.hpp"

 // Address Space for AMDGCN
 ...
@@ -41,4 +43,3 @@ __host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_addres
 }
 } // namespace ck
-#endif
include/ck/utility/amd_buffer_addressing.hpp (+5, -0)

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 #include "data_type.hpp"
 ...
@@ -7,6 +10,8 @@ namespace ck {
 template <typename T>
 union BufferResource
 {
+    __device__ constexpr BufferResource() : content{} {}
+
     // 128 bit SGPRs to supply buffer resource in buffer instructions
     // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
     int32x4_t content;
 ...
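The hunk above gives BufferResource a __device__ constexpr default constructor that value-initializes its 128-bit content member, so the buffer descriptor never starts from an indeterminate value and can be built in constant expressions. A host-only sketch of the same union idiom follows; BufferResourceSketch and int32x4_sketch are hypothetical stand-ins for CK's types, and no HIP toolchain is needed to compile it:

// Standalone sketch: a union whose constexpr default constructor activates and zeroes
// one member, mirroring the constructor added to BufferResource above.
#include <cstdint>

struct int32x4_sketch
{
    std::int32_t data[4];
};

union BufferResourceSketch
{
    // value-initialize `content`, making it the active member and zeroing all 128 bits
    constexpr BufferResourceSketch() : content{} {}

    int32x4_sketch content;
    std::int32_t words[4];
};

int main()
{
    constexpr BufferResourceSketch res{}; // usable in constant expressions, never indeterminate
    static_assert(res.content.data[0] == 0 && res.content.data[3] == 0, "starts zeroed");
    return 0;
}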
include/ck/utility/amd_inline_asm.hpp (+3, -0)

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP
 ...

The following headers receive the identical three-line SPDX/copyright preamble ahead of their existing include guards (+3, -0 each):

include/ck/utility/amd_llvm_intrinsic.hpp   (guard CK_AMD_LLVM_INTRINSIC_HPP)
include/ck/utility/amd_xdlops.hpp           (guard CK_AMD_XDLOPS_HPP)
include/ck/utility/array.hpp                (guard CK_ARRAY_HPP)
include/ck/utility/array_multi_index.hpp    (guard CK_ARRAY_MULTI_INDEX_HPP)
include/ck/utility/c_style_pointer_cast.hpp (guard CK_C_STYLE_POINTER_CAST_HPP)
include/ck/utility/common_header.hpp (+42, -9)

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 #include "config.hpp"
 #include "array.hpp"
 ...
@@ -31,19 +34,49 @@
 #include "thread_group.hpp"
 #include "debug.hpp"
-#include "amd_buffer_addressing.hpp"
-#include "generic_memory_space_atomic.hpp"
-#include "get_id.hpp"
-#include "synchronization.hpp"
-#include "amd_address_space.hpp"
-#include "static_buffer.hpp"
-#include "dynamic_buffer.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/array.hpp"
+#include "ck/utility/container_helper.hpp"
+#include "ck/utility/statically_indexed_array.hpp"
+#include "ck/utility/container_element_picker.hpp"
+#include "ck/utility/multi_index.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/functional.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/utility/functional3.hpp"
+#include "ck/utility/functional4.hpp"
+#include "ck/utility/enable_if.hpp"
+#include "ck/utility/ignore.hpp"
+#include "ck/utility/integral_constant.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/utility/number.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/utility/sequence_helper.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/utility/tuple_helper.hpp"
+#include "ck/utility/type.hpp"
+#include "ck/utility/magic_division.hpp"
+#include "ck/utility/c_style_pointer_cast.hpp"
+#include "ck/utility/is_known_at_compile_time.hpp"
+#include "ck/utility/transpose_vectors.hpp"
+#include "ck/utility/inner_product.hpp"
+#include "ck/utility/thread_group.hpp"
+#include "ck/utility/debug.hpp"
+#include "ck/utility/amd_buffer_addressing.hpp"
+#include "ck/utility/generic_memory_space_atomic.hpp"
+#include "ck/utility/get_id.hpp"
+#include "ck/utility/thread_group.hpp"
+#include "ck/utility/synchronization.hpp"
+#include "ck/utility/amd_address_space.hpp"
+#include "ck/utility/static_buffer.hpp"
+#include "ck/utility/dynamic_buffer.hpp"

 // TODO: remove this
 #if CK_USE_AMD_INLINE_ASM
-#include "amd_inline_asm.hpp"
+#include "ck/utility/amd_inline_asm.hpp"
 #endif

 #ifdef CK_USE_AMD_MFMA
-#include "amd_xdlops.hpp"
+#include "ck/utility/amd_xdlops.hpp"
 #endif