Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
cba8f7f2
Commit
cba8f7f2
authored
Jun 26, 2022
by
Anthony Chang
Browse files
Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4
parents
cc50b687
b653c5eb
Changes
583
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
594 additions
and
202 deletions
+594
-202
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+6
-2
include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
...eration/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+16
-11
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
...nsor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+15
-10
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
...or_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+14
-10
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+13
-9
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+16
-13
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
...tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+17
-14
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+16
-11
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+16
-14
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+15
-10
include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
...k/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+6
-30
include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
+383
-0
include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
...nsor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
+7
-4
include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
...r_operation/gpu/thread/reduction_functions_threadwise.hpp
+19
-42
include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
...tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
+6
-2
include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
.../tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
+3
-0
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
...nsor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
+7
-6
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
...operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+8
-7
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
...tion/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+8
-7
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
...tion/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
+3
-0
No files found.
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
#include "reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "ck/utility/common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "ck/utility/common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
View file @
cba8f7f2
#ifndef CK_GRIDWISE_GEMM_XDLOPS_V2R4_HPP
// SPDX-License-Identifier: MIT
#define CK_GRIDWISE_GEMM_XDLOPS_V2R4_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#pragma once
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -607,7 +611,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
...
@@ -607,7 +611,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
c_grid_buf
);
c_grid_buf
);
}
}
}
}
};
// namespace ck
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
View file @
cba8f7f2
#ifndef CK_GRIDWISE_GEMM_XDLOPS_V2R4R2_HPP
// SPDX-License-Identifier: MIT
#define CK_GRIDWISE_GEMM_XDLOPS_V2R4R2_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#pragma once
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -717,7 +721,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
...
@@ -717,7 +721,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
});
});
}
}
}
}
};
// namespace ck
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "tensor_space_filling_curve.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
View file @
cba8f7f2
#ifndef CK_GRIDWISE_GEMM_XDLOPS_V3R2_HPP
// SPDX-License-Identifier: MIT
#define CK_GRIDWISE_GEMM_XDLOPS_V3R2_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#pragma once
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "thread_group_tensor_slice_transfer_v6r2.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -755,4 +758,3 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
...
@@ -755,4 +758,3 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
};
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "thread_group_tensor_slice_transfer_v6r3.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
View file @
cba8f7f2
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
#pragma once
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_SET_BUFFER_VALUE_HPP
#define CK_GRIDWISE_SET_BUFFER_VALUE_HPP
#include "threadwise_tensor_slice_transfer.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -77,4 +54,3 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
...
@@ -77,4 +54,3 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
};
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
0 → 100644
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
template
<
typename
GridwiseReduction
,
typename
InDataType
,
typename
OutDataType
,
typename
AccDataType
,
typename
GridDesc_M_K
>
__global__
void
kernel_softmax
(
const
GridDesc_M_K
in_grid_desc_m_k
,
const
GridDesc_M_K
out_grid_desc_m_k
,
index_t
block_group_size
,
index_t
num_k_block_tile_iteration
,
AccDataType
alpha
,
const
InDataType
*
const
__restrict__
p_in_value_global
,
AccDataType
beta
,
OutDataType
*
const
__restrict__
p_out_value_global
)
{
GridwiseReduction
::
Run
(
in_grid_desc_m_k
,
out_grid_desc_m_k
,
block_group_size
,
num_k_block_tile_iteration
,
alpha
,
p_in_value_global
,
beta
,
p_out_value_global
);
};
template
<
typename
InDataType
,
typename
OutDataType
,
typename
AccDataType
,
typename
GridDesc_M_K
,
index_t
BlockSize
,
index_t
MThreadClusterSize
,
index_t
KThreadClusterSize
,
index_t
MThreadSliceSize
,
index_t
KThreadSliceSize
,
index_t
InSrcVectorDim
,
index_t
InSrcVectorSize
,
index_t
OutDstVectorSize
>
struct
GridwiseSoftmax_mk_to_mk
{
static_assert
(((
InSrcVectorDim
==
0
&&
MThreadSliceSize
%
InSrcVectorSize
==
0
)
||
(
InSrcVectorDim
==
1
&&
KThreadSliceSize
%
InSrcVectorSize
==
0
))
&&
(
KThreadSliceSize
%
OutDstVectorSize
==
0
),
"Invalid thread slice sizes and/or vector sizes configuration, please check!"
);
static
constexpr
bool
reorder_thread_cluster
=
(
InSrcVectorDim
==
0
);
using
ThreadClusterLengths_M_K
=
Sequence
<
MThreadClusterSize
,
KThreadClusterSize
>
;
using
ThreadBufferDimAccessOrder
=
typename
conditional
<
reorder_thread_cluster
,
Sequence
<
1
,
0
>
,
Sequence
<
0
,
1
>>::
type
;
using
ThreadClusterArrangeOrder
=
typename
conditional
<
reorder_thread_cluster
,
Sequence
<
1
,
0
>
,
Sequence
<
0
,
1
>>::
type
;
static
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths_M_K
{},
ThreadClusterArrangeOrder
{});
using
ThreadReduceSrcDesc_M_K
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{},
Number
<
KThreadSliceSize
>
{})));
using
ThreadReduceDstDesc_M
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{})));
using
BlockwiseMaxReduce
=
PartitionedBlockwiseReduction
<
AccDataType
,
BlockSize
,
ThreadClusterLengths_M_K
,
ThreadClusterArrangeOrder
,
reduce
::
Max
,
false
>
;
// PropagateNan
using
ThreadwiseMaxReduce
=
ThreadwiseReduction
<
AccDataType
,
ThreadReduceSrcDesc_M_K
,
ThreadReduceDstDesc_M
,
reduce
::
Max
,
false
>
;
// PropagateNan
using
PassThroughOp
=
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
index_t
M_BlockTileSize
=
MThreadClusterSize
*
MThreadSliceSize
;
static
constexpr
index_t
K_BlockTileSize
=
KThreadClusterSize
*
KThreadSliceSize
;
__device__
static
void
Run
(
const
GridDesc_M_K
&
in_grid_desc_m_k
,
const
GridDesc_M_K
&
out_grid_desc_m_k
,
index_t
block_group_size
,
index_t
num_k_block_tile_iteration
,
AccDataType
alpha
,
const
InDataType
*
const
__restrict__
p_in_value_global
,
AccDataType
beta
,
OutDataType
*
const
__restrict__
p_out_value_global
)
{
// LDS
__shared__
AccDataType
p_reduce_work_buffer
[
BlockSize
];
auto
out_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_value_global
,
out_grid_desc_m_k
.
GetElementSpaceSize
());
auto
reduce_work_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
p_reduce_work_buffer
,
BlockSize
);
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
in_thread_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
out_thread_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
max_value_buf
;
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
max_value_buf
(
I
)
=
reduce
::
Max
::
template
GetIdentityValue
<
AccDataType
>();
});
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
reduce
::
Add
::
template
GetIdentityValue
<
AccDataType
>();
});
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_id
=
get_block_1d_id
();
const
index_t
blkgroup_id
=
block_global_id
/
block_group_size
;
const
index_t
block_local_id
=
block_global_id
%
block_group_size
;
const
auto
thread_cluster_idx
=
thread_cluster_desc
.
CalculateBottomIndex
(
make_multi_index
(
thread_local_id
));
const
auto
thread_m_cluster_id
=
thread_cluster_idx
[
I0
];
const
auto
thread_k_cluster_id
=
thread_cluster_idx
[
I1
];
const
index_t
reduceSizePerBlock
=
K_BlockTileSize
*
num_k_block_tile_iteration
;
using
ThreadBufferLengths
=
Sequence
<
MThreadSliceSize
,
KThreadSliceSize
>
;
constexpr
auto
thread_buffer_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{},
Number
<
KThreadSliceSize
>
{}));
auto
threadwise_src_load
=
ThreadwiseTensorSliceTransfer_v2
<
InDataType
,
AccDataType
,
GridDesc_M_K
,
decltype
(
thread_buffer_desc
),
ThreadBufferLengths
,
ThreadBufferDimAccessOrder
,
InSrcVectorDim
,
InSrcVectorSize
,
1
,
false
>
(
in_grid_desc_m_k
,
make_multi_index
(
blkgroup_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
,
block_local_id
*
reduceSizePerBlock
+
thread_k_cluster_id
*
KThreadSliceSize
));
auto
threadwise_dst_load
=
ThreadwiseTensorSliceTransfer_v2
<
OutDataType
,
AccDataType
,
GridDesc_M_K
,
decltype
(
thread_buffer_desc
),
ThreadBufferLengths
,
ThreadBufferDimAccessOrder
,
InSrcVectorDim
,
InSrcVectorSize
,
1
,
false
>
(
out_grid_desc_m_k
,
make_multi_index
(
blkgroup_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
,
block_local_id
*
reduceSizePerBlock
+
thread_k_cluster_id
*
KThreadSliceSize
));
auto
threadwise_dst_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
AccDataType
,
OutDataType
,
decltype
(
thread_buffer_desc
),
GridDesc_M_K
,
PassThroughOp
,
ThreadBufferLengths
,
ThreadBufferDimAccessOrder
,
InSrcVectorDim
,
OutDstVectorSize
,
InMemoryDataOperationEnum
::
Set
,
1
,
true
>
(
out_grid_desc_m_k
,
make_multi_index
(
blkgroup_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
,
block_local_id
*
reduceSizePerBlock
+
thread_k_cluster_id
*
KThreadSliceSize
),
PassThroughOp
{});
constexpr
auto
in_thread_copy_fwd_step
=
make_multi_index
(
0
,
K_BlockTileSize
);
constexpr
auto
in_thread_copy_bwd_step
=
make_multi_index
(
0
,
-
K_BlockTileSize
);
///
/// max(x)
///
const
auto
in_global_val_buf_oob_non_zero
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_value_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
reduce
::
Max
::
template
GetIdentityValue
<
InDataType
>());
index_t
reducedTiles
=
0
;
do
{
threadwise_src_load
.
Run
(
in_grid_desc_m_k
,
in_global_val_buf_oob_non_zero
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
in_thread_buf
);
ThreadwiseMaxReduce
::
Reduce
(
in_thread_buf
,
max_value_buf
);
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_fwd_step
);
reducedTiles
++
;
}
while
(
reducedTiles
<
num_k_block_tile_iteration
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}(
[
&
](
auto
I
)
{
BlockwiseMaxReduce
::
Reduce
(
reduce_work_buf
,
max_value_buf
(
I
));
});
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_bwd_step
);
///
/// sum(exp(x - max(x)))
///
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
reduce
::
Add
::
template
GetIdentityValue
<
AccDataType
>();
});
// Normally, 0 as invalid element value is adequate since 0 makes no contribution to
// accumulated result. However, in stable softmax, all values 0s or not are subtracted by
// another value_max. As numbers become non-zero, effectively it allows invalid values to
// slip through and contribute to the accumulated result.
//
// The trick here is leveraging the fact that many math functions (add, sub, exp, ...)
// propagate NaNs when operands have NaNs involved. By initialiing invalid element value
// with NaN, an invalid value doing math manipulations is still NaN, which in turn can still
// be identified as an invalid value. We can then discard the invalid values which
// originally failed the bound check during accumulation. This allows to ignore values that
// failed bound check even after multiple math manipulations.
const
auto
in_global_val_buf_oob_nan
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_value_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
NumericLimits
<
InDataType
>::
QuietNaN
());
using
BlockwiseSumReduce
=
PartitionedBlockwiseReduction
<
AccDataType
,
BlockSize
,
ThreadClusterLengths_M_K
,
ThreadClusterArrangeOrder
,
reduce
::
Add
,
false
,
// ignored
detail
::
AccumulateWithNanIgnore
<
reduce
::
Add
,
AccDataType
>>
;
using
ThreadwiseSumReduce
=
ThreadwiseReduction
<
AccDataType
,
ThreadReduceSrcDesc_M_K
,
ThreadReduceDstDesc_M
,
reduce
::
Add
,
false
,
// ignored
detail
::
AccumulateWithNanIgnore
<
reduce
::
Add
,
AccDataType
>>
;
reducedTiles
=
0
;
do
{
threadwise_src_load
.
Run
(
in_grid_desc_m_k
,
in_global_val_buf_oob_nan
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
in_thread_buf
);
// do element-wise pre-reduction operation
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
constexpr
auto
offset
=
thread_buffer_desc
.
CalculateOffset
(
make_tuple
(
iM
,
iK
));
in_thread_buf
(
Number
<
offset
>
{})
=
math
::
exp
(
in_thread_buf
(
Number
<
offset
>
{})
-
max_value_buf
(
iM
));
});
});
ThreadwiseSumReduce
::
Reduce
(
in_thread_buf
,
accu_value_buf
);
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_bwd_step
);
reducedTiles
++
;
}
while
(
reducedTiles
<
num_k_block_tile_iteration
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
BlockwiseSumReduce
::
Reduce
(
reduce_work_buf
,
accu_value_buf
(
I
));
// block_sync_lds();
});
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_fwd_step
);
///
/// softmax
///
reducedTiles
=
0
;
if
(
float_equal_zero
{}(
beta
))
{
do
{
threadwise_src_load
.
Run
(
in_grid_desc_m_k
,
in_global_val_buf_oob_nan
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
in_thread_buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
// out = alpha * exp(x - max(x)) / sum(exp(x - max(x)))
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
constexpr
auto
offset
=
thread_buffer_desc
.
CalculateOffset
(
make_tuple
(
iM
,
iK
));
out_thread_buf
(
Number
<
offset
>
{})
=
alpha
*
math
::
exp
(
in_thread_buf
(
Number
<
offset
>
{})
-
max_value_buf
(
iM
))
/
accu_value_buf
(
iM
);
});
});
threadwise_dst_store
.
Run
(
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
out_thread_buf
,
out_grid_desc_m_k
,
out_global_val_buf
);
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_fwd_step
);
threadwise_dst_store
.
MoveDstSliceWindow
(
out_grid_desc_m_k
,
in_thread_copy_fwd_step
);
reducedTiles
++
;
}
while
(
reducedTiles
<
num_k_block_tile_iteration
);
}
else
{
do
{
threadwise_src_load
.
Run
(
in_grid_desc_m_k
,
in_global_val_buf_oob_nan
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
in_thread_buf
);
threadwise_dst_load
.
Run
(
out_grid_desc_m_k
,
out_global_val_buf
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
out_thread_buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
// out = alpha * exp(x - max(x)) / sum(exp(x - max(x))) + beta * prior_out
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
constexpr
auto
offset
=
thread_buffer_desc
.
CalculateOffset
(
make_tuple
(
iM
,
iK
));
out_thread_buf
(
Number
<
offset
>
{})
=
alpha
*
math
::
exp
(
in_thread_buf
(
Number
<
offset
>
{})
-
max_value_buf
(
iM
))
/
accu_value_buf
(
iM
)
+
beta
*
out_thread_buf
(
Number
<
offset
>
{});
});
});
threadwise_dst_store
.
Run
(
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
out_thread_buf
,
out_grid_desc_m_k
,
out_global_val_buf
);
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_fwd_step
);
threadwise_dst_store
.
MoveDstSliceWindow
(
out_grid_desc_m_k
,
in_thread_copy_fwd_step
);
threadwise_dst_load
.
MoveSrcSliceWindow
(
out_grid_desc_m_k
,
in_thread_copy_fwd_step
);
reducedTiles
++
;
}
while
(
reducedTiles
<
num_k_block_tile_iteration
);
}
}
};
}
// namespace ck
include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "c
luster_descriptor
.hpp"
#include "c
k/utility/data_type
.hpp"
#include "
data_type
.hpp"
#include "
ck/tensor_description/cluster_descriptor
.hpp"
#include "
element_wise_operation
.hpp"
#include "
ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer
.hpp"
#include "
threadwise_tensor_slice_transfer
.hpp"
#include "
ck/tensor_operation/gpu/element/element_wise_operation
.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
View file @
cba8f7f2
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
#pragma once
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
#include "ck/utility/reduction_functions_accumulate.hpp"
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "reduction_functions_accumulate.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -39,7 +16,9 @@ template <typename AccDataType,
...
@@ -39,7 +16,9 @@ template <typename AccDataType,
typename
SrcThreadDesc_M_K
,
typename
SrcThreadDesc_M_K
,
typename
DstThreadDesc_M
,
typename
DstThreadDesc_M
,
typename
OpReduce
,
typename
OpReduce
,
bool
PropagateNan
>
bool
PropagateNan
,
typename
Accumulation
=
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
>
>
struct
ThreadwiseReduction
struct
ThreadwiseReduction
{
{
static
constexpr
auto
src_thread_desc_m_k
=
SrcThreadDesc_M_K
{};
static
constexpr
auto
src_thread_desc_m_k
=
SrcThreadDesc_M_K
{};
...
@@ -74,12 +53,15 @@ struct ThreadwiseReduction
...
@@ -74,12 +53,15 @@ struct ThreadwiseReduction
// 2) DstDesc is known at compile-time
// 2) DstDesc is known at compile-time
// 3) SrcBuffer is static buffer
// 3) SrcBuffer is static buffer
// 4) DstBuffer is static buffer
// 4) DstBuffer is static buffer
template
<
typename
AccDataType
,
template
<
typename
IndexDataType
,
typename
AccDataType
,
typename
SrcThreadDesc_M_K
,
typename
IndexDataType
,
typename
DstThreadDesc_M
,
typename
SrcThreadDesc_M_K
,
typename
OpReduce
,
typename
DstThreadDesc_M
,
bool
PropagateNan
>
typename
OpReduce
,
bool
PropagateNan
,
typename
Accumulation
=
detail
::
AccumulateWithIndexAndNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
,
IndexDataType
>
>
struct
ThreadwiseReductionWithIndex
struct
ThreadwiseReductionWithIndex
{
{
static
constexpr
auto
src_thread_desc_m_k
=
SrcThreadDesc_M_K
{};
static
constexpr
auto
src_thread_desc_m_k
=
SrcThreadDesc_M_K
{};
...
@@ -91,9 +73,6 @@ struct ThreadwiseReductionWithIndex
...
@@ -91,9 +73,6 @@ struct ThreadwiseReductionWithIndex
static_assert
(
src_length_m
==
dst_length_m
,
"lengths of source and dst buffer must match!"
);
static_assert
(
src_length_m
==
dst_length_m
,
"lengths of source and dst buffer must match!"
);
using
Accumulation
=
detail
::
AccumulateWithIndexAndNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
,
IndexDataType
>
;
template
<
typename
SrcValueBufferType
,
template
<
typename
SrcValueBufferType
,
typename
SrcIndexBufferType
,
typename
SrcIndexBufferType
,
typename
DstValueBufferType
,
typename
DstValueBufferType
,
...
@@ -118,6 +97,4 @@ struct ThreadwiseReductionWithIndex
...
@@ -118,6 +97,4 @@ struct ThreadwiseReductionWithIndex
};
};
};
};
};
// end of namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "common_header.hpp"
#include "math.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/math.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP
#ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP
#define CK_THREADWISE_GEMM_DLOPS_V3_HPP
#define CK_THREADWISE_GEMM_DLOPS_V3_HPP
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp
View file @
cba8f7f2
#ifndef CK_THREADWISE_TENSOR_SET_HPP
// SPDX-License-Identifier: MIT
#define CK_THREADWISE_TENSOR_SET_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#pragma once
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -56,4 +58,3 @@ struct ThreadwiseTensorSliceSet_v1
...
@@ -56,4 +58,3 @@ struct ThreadwiseTensorSliceSet_v1
};
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
View file @
cba8f7f2
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
// SPDX-License-Identifier: MIT
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#pragma once
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_space_filling_curve.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -1168,4 +1170,3 @@ struct ThreadwiseTensorSliceTransfer_v4
...
@@ -1168,4 +1170,3 @@ struct ThreadwiseTensorSliceTransfer_v4
};
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
View file @
cba8f7f2
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
// SPDX-License-Identifier: MIT
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_header.hpp"
#pragma once
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "static_tensor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor/static_tensor.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -789,4 +791,3 @@ struct ThreadwiseTensorSliceTransfer_v3r1
...
@@ -789,4 +791,3 @@ struct ThreadwiseTensorSliceTransfer_v3r1
};
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
...
...
Prev
1
…
5
6
7
8
9
10
11
12
13
…
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment