Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
5d015452
Commit
5d015452
authored
Jul 06, 2022
by
Chaitanya Inumella
Browse files
Rebased the hipTENSOR development branch with the contraction branch
parents
b7fa6bb1
ed3feb4d
Changes
425
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
153 additions
and
121 deletions
+153
-121
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
...m_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
+3
-0
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
...m_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
+3
-0
include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
...orm_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
+3
-0
include/ck/stream_config.hpp
include/ck/stream_config.hpp
+3
-0
include/ck/tensor/static_tensor.hpp
include/ck/tensor/static_tensor.hpp
+3
-0
include/ck/tensor_description/cluster_descriptor.hpp
include/ck/tensor_description/cluster_descriptor.hpp
+6
-5
include/ck/tensor_description/multi_index_transform.hpp
include/ck/tensor_description/multi_index_transform.hpp
+6
-5
include/ck/tensor_description/multi_index_transform_helper.hpp
...de/ck/tensor_description/multi_index_transform_helper.hpp
+6
-5
include/ck/tensor_description/tensor_adaptor.hpp
include/ck/tensor_description/tensor_adaptor.hpp
+11
-6
include/ck/tensor_description/tensor_descriptor.hpp
include/ck/tensor_description/tensor_descriptor.hpp
+13
-5
include/ck/tensor_description/tensor_descriptor_helper.hpp
include/ck/tensor_description/tensor_descriptor_helper.hpp
+7
-3
include/ck/tensor_description/tensor_space_filling_curve.hpp
include/ck/tensor_description/tensor_space_filling_curve.hpp
+10
-9
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
.../ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
+8
-4
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
.../tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
+3
-0
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
...ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
+3
-0
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+16
-13
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
...ration/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
+9
-8
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
...sor_operation/gpu/block/reduction_functions_blockwise.hpp
+22
-48
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+9
-5
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
+9
-5
No files found.
Too many changes to show.
To preserve performance only
425 of 425+
files are displayed.
Plain diff
Email patch
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
...
...
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
...
...
include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
...
...
include/ck/stream_config.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
...
...
include/ck/tensor/static_tensor.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_STATIC_TENSOR_HPP
#define CK_STATIC_TENSOR_HPP
...
...
include/ck/tensor_description/cluster_descriptor.hpp
View file @
5d015452
#ifndef CK_CLUSTER_DESCRIPTOR_HPP
#define CK_CLUSTER_DESCRIPTOR_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#include "tensor_adaptor.hpp"
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
namespace
ck
{
...
...
@@ -30,4 +32,3 @@ __host__ __device__ constexpr auto make_cluster_descriptor(
}
}
// namespace ck
#endif
include/ck/tensor_description/multi_index_transform.hpp
View file @
5d015452
#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
#define CK_MULTI_INDEX_TRANSFORM_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#include "multi_index.hpp"
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/utility/multi_index.hpp"
namespace
ck
{
...
...
@@ -1950,4 +1952,3 @@ struct Modulo
}
};
}
// namespace ck
#endif
include/ck/tensor_description/multi_index_transform_helper.hpp
View file @
5d015452
#ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
#define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#include "multi_index_transform.hpp"
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/multi_index_transform.hpp"
namespace
ck
{
...
...
@@ -126,4 +128,3 @@ __host__ __device__ constexpr auto make_modulo_transform(const Modulus& modulus,
return
Modulo
<
Modulus
,
UpLength
>
{
modulus
,
up_length
};
}
}
// namespace ck
#endif
include/ck/tensor_description/tensor_adaptor.hpp
View file @
5d015452
#ifndef CK_TENSOR_ADAPTOR_HPP
#define CK_TENSOR_ADAPTOR_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
namespace
ck
{
...
...
@@ -136,7 +138,11 @@ struct TensorAdaptor
using
ElementSize
=
remove_cv_t
<
decltype
(
InitializeElementSize
(
Transforms
{}))
>
;
public:
#if 0 // workaround compiler complaint about constexpr
__host__ __device__ constexpr TensorAdaptor() = default;
#else
__host__
__device__
constexpr
TensorAdaptor
()
:
transforms_
{},
element_size_
{}
{}
#endif
__host__
__device__
constexpr
TensorAdaptor
(
const
Transforms
&
transforms
)
:
transforms_
{
transforms
},
element_size_
{
InitializeElementSize
(
transforms
)}
...
...
@@ -474,4 +480,3 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
}
}
// namespace ck
#endif
include/ck/tensor_description/tensor_descriptor.hpp
View file @
5d015452
#ifndef CK_TENSOR_DESCRIPTOR_HPP
#define CK_TENSOR_DESCRIPTOR_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#include "multi_index_transform.hpp"
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/multi_index_transform.hpp"
namespace
ck
{
...
...
@@ -111,7 +113,14 @@ struct TensorDescriptor
using
ElementSize
=
remove_cv_t
<
decltype
(
InitializeElementSize
(
Transforms
{}))
>
;
public:
#if 0 // workaround compiler complaint about constexpr
__host__ __device__ constexpr TensorDescriptor() = default;
#else
__host__
__device__
constexpr
TensorDescriptor
()
:
transforms_
{},
element_size_
{},
element_space_size_
{}
{
}
#endif
__host__
__device__
constexpr
TensorDescriptor
(
const
Transforms
&
transforms
,
ElementSpaceSize
element_space_size
)
...
...
@@ -597,4 +606,3 @@ using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(
TensorDesc
{},
MultiIndex
<
remove_cvref_t
<
TensorDesc
>::
GetNumOfDimension
()
>
{}));
}
// namespace ck
#endif
include/ck/tensor_description/tensor_descriptor_helper.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
namespace
ck
{
...
...
include/ck/
utility
/tensor_space_filling_curve.hpp
→
include/ck/
tensor_description
/tensor_space_filling_curve.hpp
View file @
5d015452
#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
#define TENSOR_SPACE_FILLING_CURVE_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "math.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "tensor_adaptor.hpp"
#include "statically_indexed_array_multi_index.hpp"
#include "tuple_helper.hpp"
#pragma once
#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence_helper.hpp"
#include "ck/utility/statically_indexed_array_multi_index.hpp"
#include "ck/utility/tuple_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
namespace
ck
{
...
...
@@ -156,4 +158,3 @@ struct SpaceFillingCurve
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "common_header.hpp"
#include "tensor_adaptor.hpp"
#include "threadwise_tensor_slice_transfer_v4r1.hpp"
#include "threadwise_contraction_dl.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp"
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
#define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
#define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "common_header.hpp"
#include "
threadwise_tensor_slice_transf
er.hpp"
#include "
xdlops_gemm
.hpp"
#include "tensor_
adaptor
.hpp"
#include "
thread_group
.hpp"
#include "
ck/utility/common_head
er.hpp"
#include "
ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer
.hpp"
#include "
ck/
tensor_
operation/gpu/warp/xdlops_gemm
.hpp"
#include "
ck/tensor_description/tensor_adaptor
.hpp"
namespace
ck
{
...
...
@@ -438,7 +441,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
make_tuple
(
n0
,
I0
,
I0
,
I0
),
b_thread_buf
);
});
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
// NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except
// the first, as we can shorten non-MAC cluster a bit and there's no observable negative
// impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids
...
...
@@ -448,7 +451,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
if
constexpr
(
k
.
value
!=
0
||
KPerInnerLoop
==
KPerThread
)
{
asm
volatile
(
"s_barrier"
::
);
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
}
static_for
<
0
,
KPerInnerLoop
,
KPack
>
{}([
&
](
auto
k_
)
{
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
...
...
@@ -480,9 +483,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
k_
.
value
==
KPerInnerLoop
-
KPack
&&
m0
.
value
==
MRepeat
-
1
&&
n0
.
value
==
NRepeat
-
1
)
{
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
block_sync_lds
();
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
}
// TODO: insert setprio in more precise manner since we
...
...
@@ -493,16 +496,16 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
c_thread_buf
.
GetVectorTypeReference
(
Number
<
c_offset
>{}));
if
constexpr
(
k_
.
value
==
0
&&
m0
.
value
==
0
&&
n0
.
value
==
0
)
{
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
__builtin_amdgcn_s_setprio
(
1
);
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
}
});
});
});
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
__builtin_amdgcn_s_setprio
(
0
);
__builtin_amdgcn_sched_barrier
();
__builtin_amdgcn_sched_barrier
(
0
);
});
}
...
...
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
View file @
5d015452
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v5r1.hpp"
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp"
namespace
ck
{
...
...
@@ -152,4 +154,3 @@ struct BlockwiseTensorSliceTransfer_v5r1
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
View file @
5d015452
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#include "reduction_common.hpp"
#include "reduction_functions_accumulate.hpp"
#include "cluster_descriptor.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
namespace
ck
{
...
...
@@ -45,7 +21,9 @@ template <typename AccDataType,
typename
ThreadClusterLengths_M_K
,
typename
ThreadClusterArrangeOrder
,
typename
OpReduce
,
bool
PropagateNan
>
bool
PropagateNan
,
typename
Accumulation
=
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
>
>
struct
PartitionedBlockwiseReduction
{
static_assert
(
BlockSize
==
ThreadClusterLengths_M_K
::
At
(
0
)
*
ThreadClusterLengths_M_K
::
At
(
1
),
...
...
@@ -62,8 +40,6 @@ struct PartitionedBlockwiseReduction
static
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths_M_K
{},
ThreadClusterArrangeOrder
{});
using
Accumulation
=
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
>
;
template
<
typename
BufferType
>
__device__
static
void
Reduce
(
BufferType
&
work_buffer
,
AccDataType
&
in_out_value
)
{
...
...
@@ -113,13 +89,16 @@ struct PartitionedBlockwiseReduction
// 3) in_out_value/in_out_index is the input data in vgpr from each thread
// 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
// clang-format on
template
<
typename
AccDataType
,
typename
IndexDataType
,
index_t
BlockSize
,
typename
ThreadClusterLengths_M_K
,
typename
ThreadClusterArrangeOrder
,
typename
OpReduce
,
bool
PropagateNan
>
template
<
typename
AccDataType
,
typename
IndexDataType
,
index_t
BlockSize
,
typename
ThreadClusterLengths_M_K
,
typename
ThreadClusterArrangeOrder
,
typename
OpReduce
,
bool
PropagateNan
,
typename
Accumulation
=
detail
::
AccumulateWithIndexAndNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
,
IndexDataType
>
>
struct
PartitionedBlockwiseReductionWithIndex
{
static_assert
(
BlockSize
==
ThreadClusterLengths_M_K
::
At
(
0
)
*
ThreadClusterLengths_M_K
::
At
(
1
),
...
...
@@ -136,9 +115,6 @@ struct PartitionedBlockwiseReductionWithIndex
static
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths_M_K
{},
ThreadClusterArrangeOrder
{});
using
Accumulation
=
detail
::
AccumulateWithIndexAndNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
,
IndexDataType
>
;
// This interface accumulates on both data values and indices
template
<
typename
BufferType
,
typename
IdxBufferType
>
__device__
static
void
Reduce
(
BufferType
&
work_val_buffer
,
...
...
@@ -193,6 +169,4 @@ struct PartitionedBlockwiseReductionWithIndex
};
};
};
// end of namespace ck
#endif
}
// namespace ck
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v3r1.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v6r1.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp"
namespace
ck
{
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment