gaoqiong / composable_kernel, commit b79df771

Merge remote-tracking branch 'origin/develop' into cpu_avx2

Authored Jul 12, 2022 by carlushuang
Parents: 05d38218, 63914743
Changes: 450. Showing 20 changed files with 159 additions and 126 deletions (+159 −126).
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp (+3 −0)
include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp (+3 −0)
include/ck/stream_config.hpp (+3 −0)
include/ck/tensor/static_tensor.hpp (+3 −0)
include/ck/tensor_description/cluster_descriptor.hpp (+6 −5)
include/ck/tensor_description/multi_index_transform.hpp (+6 −5)
include/ck/tensor_description/multi_index_transform_helper.hpp (+6 −5)
include/ck/tensor_description/tensor_adaptor.hpp (+11 −6)
include/ck/tensor_description/tensor_descriptor.hpp (+13 −5)
include/ck/tensor_description/tensor_descriptor_helper.hpp (+7 −3)
include/ck/tensor_description/tensor_space_filling_curve.hpp (moved from include/ck/utility/) (+10 −9)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp (+8 −4)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp (+3 −0)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp (+3 −0)
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp (+16 −13)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp (+9 −8)
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp (+22 −48)
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp (+9 −5)
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp (+9 −5)
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp (+9 −5)
Too many changes to show: only 450 of 450+ changed files are displayed.
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
  #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
  ...
include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
  #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
  ...
include/ck/stream_config.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
  #ifndef CK_NOGPU
  ...
include/ck/tensor/static_tensor.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_STATIC_TENSOR_HPP
  #define CK_STATIC_TENSOR_HPP
  ...
include/ck/tensor_description/cluster_descriptor.hpp (+6 −5)

- #ifndef CK_CLUSTER_DESCRIPTOR_HPP
- #define CK_CLUSTER_DESCRIPTOR_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "tensor_adaptor.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
  namespace ck {
  ...
@@ -30,4 +32,3 @@ __host__ __device__ constexpr auto make_cluster_descriptor(
  }
  } // namespace ck
- #endif
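Most header changes in this merge follow the two patterns visible above: classic include guards are replaced by an SPDX license header plus #pragma once, and includes written relative to the including file's directory become paths anchored at the project root. A minimal sketch of the pattern, using a hypothetical header name and assuming the build passes -I<repo>/include so the ck/-rooted paths resolve:

    // Before: guard macro, and an include resolved relative to this header's directory.
    #ifndef CK_EXAMPLE_HPP
    #define CK_EXAMPLE_HPP
    #include "common_header.hpp"
    #endif

    // After: SPDX header, #pragma once, and a project-root-relative include,
    // which resolves identically no matter which file includes this header.
    // SPDX-License-Identifier: MIT
    #pragma once
    #include "ck/utility/common_header.hpp"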
include/ck/tensor_description/multi_index_transform.hpp (+6 −5)

- #ifndef CK_MULTI_INDEX_TRANSFORM_HPP
- #define CK_MULTI_INDEX_TRANSFORM_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "multi_index.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/utility/multi_index.hpp"
  namespace ck {
  ...
@@ -1950,4 +1952,3 @@ struct Modulo
      }
  };
  } // namespace ck
- #endif
include/ck/tensor_description/multi_index_transform_helper.hpp (+6 −5)

- #ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
- #define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "multi_index_transform.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/multi_index_transform.hpp"
  namespace ck {
  ...
@@ -126,4 +128,3 @@ __host__ __device__ constexpr auto make_modulo_transform(const Modulus& modulus,
      return Modulo<Modulus, UpLength>{modulus, up_length};
  }
  } // namespace ck
- #endif
include/ck/tensor_description/tensor_adaptor.hpp (+11 −6)

- #ifndef CK_TENSOR_ADAPTOR_HPP
- #define CK_TENSOR_ADAPTOR_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
  namespace ck {
  ...
@@ -136,7 +138,11 @@ struct TensorAdaptor
      using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;

      public:
+ #if 0 // workaround compiler complaint about constexpr
      __host__ __device__ constexpr TensorAdaptor() = default;
+ #else
+     __host__ __device__ constexpr TensorAdaptor() : transforms_{}, element_size_{} {}
+ #endif
      __host__ __device__ constexpr TensorAdaptor(const Transforms& transforms)
          : transforms_{transforms}, element_size_{InitializeElementSize(transforms)}
  ...
@@ -474,4 +480,3 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&...
  }
  } // namespace ck
- #endif
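The hunk above (repeated for TensorDescriptor in the next file) works around a compiler complaint about defaulted constexpr default constructors: a `constexpr ... () = default;` that would leave members uninitialized is not usable in constant expressions, so the members are value-initialized explicitly instead. A minimal sketch of the issue with a hypothetical type, not CK code:

    // Sketch of the "workaround compiler complaint about constexpr" pattern.
    struct Example
    {
    #if 0
        constexpr Example() = default; // rejected by some compilers: x_/y_ left uninitialized
    #else
        constexpr Example() : x_{}, y_{} {} // explicit value-initialization of all members
    #endif
        int x_;
        float y_;
    };

    constexpr Example e{}; // OK: usable in a constant expression with the explicit form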
include/ck/tensor_description/tensor_descriptor.hpp (+13 −5)

- #ifndef CK_TENSOR_DESCRIPTOR_HPP
- #define CK_TENSOR_DESCRIPTOR_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "multi_index_transform.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/multi_index_transform.hpp"
  namespace ck {
  ...
@@ -111,7 +113,14 @@ struct TensorDescriptor
      using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;

      public:
+ #if 0 // workaround compiler complaint about constexpr
      __host__ __device__ constexpr TensorDescriptor() = default;
+ #else
+     __host__ __device__ constexpr TensorDescriptor()
+         : transforms_{}, element_size_{}, element_space_size_{}
+     {
+     }
+ #endif
      __host__ __device__ constexpr TensorDescriptor(const Transforms& transforms,
                                                     ElementSpaceSize element_space_size)
  ...
@@ -602,4 +611,3 @@ using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(
      TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}));
  } // namespace ck
- #endif
include/ck/tensor_description/tensor_descriptor_helper.hpp (+7 −3)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "multi_index_transform_helper.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/multi_index_transform_helper.hpp"
  namespace ck {
  ...
include/ck/utility/tensor_space_filling_curve.hpp → include/ck/tensor_description/tensor_space_filling_curve.hpp (+10 −9)

- #ifndef TENSOR_SPACE_FILLING_CURVE_HPP
- #define TENSOR_SPACE_FILLING_CURVE_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "math.hpp"
- #include "sequence.hpp"
- #include "sequence_helper.hpp"
- #include "tensor_adaptor.hpp"
- #include "statically_indexed_array_multi_index.hpp"
- #include "tuple_helper.hpp"
+ #include "ck/utility/math.hpp"
+ #include "ck/utility/sequence.hpp"
+ #include "ck/utility/sequence_helper.hpp"
+ #include "ck/utility/statically_indexed_array_multi_index.hpp"
+ #include "ck/utility/tuple_helper.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
  namespace ck {
  ...
@@ -156,4 +158,3 @@ struct SpaceFillingCurve
  };
  } // namespace ck
- #endif
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp (+8 −4)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_adaptor.hpp"
- #include "threadwise_tensor_slice_transfer_v4r1.hpp"
- #include "threadwise_contraction_dl.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp"
  namespace ck {
  ...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
  #define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
  ...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
  #define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
  ...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp (+16 −13)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "threadwise_tensor_slice_transfer.hpp"
- #include "xdlops_gemm.hpp"
- #include "tensor_adaptor.hpp"
- #include "thread_group.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+ #include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
  namespace ck {
  ...
@@ -438,7 +441,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                     make_tuple(n0, I0, I0, I0),
                     b_thread_buf);
      });
-     __builtin_amdgcn_sched_barrier();
+     __builtin_amdgcn_sched_barrier(0);
      // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except
      // the first, as we can shorten non-MAC cluster a bit and there's no observable negative
      // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids
  ...
@@ -448,7 +451,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
      if constexpr(k.value != 0 || KPerInnerLoop == KPerThread)
      {
          asm volatile("s_barrier" ::);
-         __builtin_amdgcn_sched_barrier();
+         __builtin_amdgcn_sched_barrier(0);
      }
      static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
          static_for<0, MRepeat, 1>{}([&](auto m0) {
  ...
@@ -480,9 +483,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                       k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 &&
                       n0.value == NRepeat - 1)
          {
-             __builtin_amdgcn_sched_barrier();
+             __builtin_amdgcn_sched_barrier(0);
              block_sync_lds();
-             __builtin_amdgcn_sched_barrier();
+             __builtin_amdgcn_sched_barrier(0);
          }
          // TODO: insert setprio in more precise manner since we
  ...
@@ -493,16 +496,16 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                  c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
              if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
              {
-                 __builtin_amdgcn_sched_barrier();
+                 __builtin_amdgcn_sched_barrier(0);
                  __builtin_amdgcn_s_setprio(1);
-                 __builtin_amdgcn_sched_barrier();
+                 __builtin_amdgcn_sched_barrier(0);
              }
              });
          });
      });
-     __builtin_amdgcn_sched_barrier();
+     __builtin_amdgcn_sched_barrier(0);
      __builtin_amdgcn_s_setprio(0);
-     __builtin_amdgcn_sched_barrier();
+     __builtin_amdgcn_sched_barrier(0);
      });
  }
  ...
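Beyond the include cleanup, the functional change in this file is that every __builtin_amdgcn_sched_barrier() call gains an explicit argument of 0. This matches the Clang/LLVM AMDGPU builtin, which takes a mask describing which instruction kinds the backend scheduler may move across the barrier; per the LLVM documentation, a mask of 0 means no instructions may cross, preserving the hand-placed ordering around the s_barrier and setprio sequences. A minimal hedged sketch of the usage (HIP device code, AMDGPU targets only; the function name is illustrative):

    __device__ void fence_scheduler_example()
    {
        // mask 0: the instruction scheduler may not reorder anything across this point
        __builtin_amdgcn_sched_barrier(0);
        __builtin_amdgcn_s_setprio(1); // raise wave priority, e.g. around an MFMA cluster
        __builtin_amdgcn_sched_barrier(0);
    }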
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp (+9 −8)

- #ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
- #define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v5r1.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp"
  namespace ck {
  ...
@@ -152,4 +154,3 @@ struct BlockwiseTensorSliceTransfer_v5r1
  };
  } // namespace ck
- #endif
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp (+22 −48)

- /*******************************************************************************
-  *
-  * MIT License
-  *
-  * Copyright (c) 2020 Advanced Micro Devices, Inc.
-  *
-  * Permission is hereby granted, free of charge, to any person obtaining a copy
-  * of this software and associated documentation files (the "Software"), to deal
-  * in the Software without restriction, including without limitation the rights
-  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  * copies of the Software, and to permit persons to whom the Software is
-  * furnished to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all
-  * copies or substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-  * SOFTWARE.
-  *
-  *******************************************************************************/
- #ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
- #define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "reduction_common.hpp"
- #include "reduction_functions_accumulate.hpp"
- #include "cluster_descriptor.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/utility/reduction_common.hpp"
+ #include "ck/utility/reduction_functions_accumulate.hpp"
  namespace ck {
  ...
@@ -45,7 +21,9 @@ template <typename AccDataType,
            typename ThreadClusterLengths_M_K,
            typename ThreadClusterArrangeOrder,
            typename OpReduce,
-           bool PropagateNan>
+           bool PropagateNan,
+           typename Accumulation =
+               detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>>
  struct PartitionedBlockwiseReduction
  {
      static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
  ...
@@ -62,8 +40,6 @@ struct PartitionedBlockwiseReduction
      static constexpr auto thread_cluster_desc =
          make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-     using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
      template <typename BufferType>
      __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
      {
  ...
@@ -113,13 +89,16 @@
  // 3) in_out_value/in_out_index is the input data in vgpr from each thread
  // 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
  // clang-format on
  template <typename AccDataType,
            typename IndexDataType,
            index_t BlockSize,
            typename ThreadClusterLengths_M_K,
            typename ThreadClusterArrangeOrder,
            typename OpReduce,
-           bool PropagateNan>
+           bool PropagateNan,
+           typename Accumulation = detail::
+               AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>>
  struct PartitionedBlockwiseReductionWithIndex
  {
      static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
  ...
@@ -136,9 +115,6 @@ struct PartitionedBlockwiseReductionWithIndex
      static constexpr auto thread_cluster_desc =
          make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-     using Accumulation =
-         detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
      // This interface accumulates on both data values and indices
      template <typename BufferType, typename IdxBufferType>
      __device__ static void Reduce(BufferType& work_val_buffer,
  ...
@@ -193,6 +169,4 @@ struct PartitionedBlockwiseReductionWithIndex
          };
  };
- }; // end of namespace ck
+ } // namespace ck
- #endif
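The substantive change in this file, beyond the license and include cleanup, is that the accumulation policy moves from an internal `using Accumulation = ...;` alias to a defaulted template parameter, so callers can substitute their own accumulator while existing instantiations keep the NaN-checking default. A minimal sketch of the pattern with hypothetical names, not the CK types:

    #include <algorithm>

    // Stand-in for a default policy like detail::AccumulateWithNanCheck.
    template <typename T>
    struct MaxAccumulate
    {
        static void Calculate(T& acc, T v) { acc = std::max(acc, v); }
    };

    // Before the refactor the policy was a fixed alias inside the struct;
    // after, it is a defaulted template parameter, i.e. a customization point.
    template <typename T, typename Accumulation = MaxAccumulate<T>>
    struct Reduction
    {
        static T Reduce(const T* data, int n, T init)
        {
            T acc = init;
            for(int i = 0; i < n; ++i)
                Accumulation::Calculate(acc, data[i]); // policy call, as in the diff
            return acc;
        }
    };

Existing code that wrote Reduction<float> is unaffected, while a caller can now instantiate Reduction<float, SomeOtherPolicy> without touching this header.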
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp (+9 −5)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v3r1.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
  namespace ck {
  ...
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp (+9 −5)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v6r1.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp"
  namespace ck {
  ...
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp (+9 −5)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v6r2.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp"
  namespace ck {
  ...