Unverified Commit d480a5a6 authored by Max Podkorytov's avatar Max Podkorytov Committed by GitHub
Browse files

Merge branch 'develop' into ck-flex

parents bca939ce 9c5b2f39
...@@ -77,8 +77,8 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep ...@@ -77,8 +77,8 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
index_t grid_size = 0; index_t grid_size = 0;
for(const auto& it_desc : gemm_descs) for(const auto& it_desc : gemm_descs)
{ {
const auto dim3 = TilePartitioner::GridSize(it_desc.M, it_desc.N); const auto local_grid_size = TilePartitioner::GridSize(it_desc.M, it_desc.N);
grid_size += dim3.x * dim3.y * 1; grid_size += local_grid_size * it_desc.k_batch;
} }
return dim3(grid_size, 1, 1); return dim3(grid_size, 1, 1);
} }
...@@ -106,8 +106,7 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep ...@@ -106,8 +106,7 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
const index_t stride_b = gemm_descs[i].stride_B; const index_t stride_b = gemm_descs[i].stride_B;
const index_t stride_c = gemm_descs[i].stride_C; const index_t stride_c = gemm_descs[i].stride_C;
const auto dim3 = TilePartitioner::GridSize(M, N); const index_t grid_size_grp = TilePartitioner::GridSize(M, N) * gemm_descs[i].k_batch;
const index_t grid_size_grp = dim3.x;
const index_t block_start = grid_size; const index_t block_start = grid_size;
const index_t block_end = grid_size + grid_size_grp; const index_t block_end = grid_size + grid_size_grp;
...@@ -138,8 +137,8 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep ...@@ -138,8 +137,8 @@ struct GroupedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
CK_TILE_DEVICE void Run(const GemmTransKernelArg& kargs) const CK_TILE_DEVICE void Run(const GemmTransKernelArg& kargs) const
{ {
const auto [iM, iN] = const auto [iM, iN] = OffsetTile1DPartitioner::GetOffsetedTileIndex(
OffsetTile1DPartitioner::GetOffsetedTileIndex(kargs.block_start, kargs.group_karg.N); kargs.block_start, kargs.group_karg.M, kargs.group_karg.N);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
......
...@@ -21,6 +21,8 @@ struct GemmPipelineAgBgCrImplBase ...@@ -21,6 +21,8 @@ struct GemmPipelineAgBgCrImplBase
static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t NPerBlock = BlockGemmShape::kN;
static constexpr index_t KPerBlock = BlockGemmShape::kK; static constexpr index_t KPerBlock = BlockGemmShape::kK;
CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
template <typename DstBlockTile, typename SrcTileWindow, typename DramTileWindowStep> template <typename DstBlockTile, typename SrcTileWindow, typename DramTileWindowStep>
CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile, CK_TILE_DEVICE void GlobalPrefetch(DstBlockTile& dst_block_tile,
SrcTileWindow& dram_tile_window, SrcTileWindow& dram_tile_window,
......
...@@ -20,6 +20,8 @@ struct BaseGemmPipelineAgBgCrCompV3 ...@@ -20,6 +20,8 @@ struct BaseGemmPipelineAgBgCrCompV3
static constexpr index_t PrefillStages = 1; static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1; static constexpr index_t GlobalBufferNum = 1;
CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop) CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop)
{ {
return num_loop > PrefetchStages; return num_loop > PrefetchStages;
...@@ -62,9 +64,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem> ...@@ -62,9 +64,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t NPerBlock = BlockGemmShape::kN;
static constexpr index_t KPerBlock = BlockGemmShape::kK; static constexpr index_t KPerBlock = BlockGemmShape::kK;
static constexpr index_t VectorSizeA = Policy::template GetVectorSizeA<Problem>(); static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA<Problem>(); }
static constexpr index_t VectorSizeB = Policy::template GetVectorSizeB<Problem>(); static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
static constexpr index_t VectorSizeC = Policy::template GetVectorSizeC<Problem>(); static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
static constexpr bool kPadM = Problem::kPadM; static constexpr bool kPadM = Problem::kPadM;
static constexpr bool kPadN = Problem::kPadN; static constexpr bool kPadN = Problem::kPadN;
...@@ -81,11 +83,6 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem> ...@@ -81,11 +83,6 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
return Policy::template GetSmemSize<Problem>(); return Policy::template GetSmemSize<Problem>();
} }
CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC()
{
return Policy::template IsTransposeC<Problem>();
}
template <GemmPipelineScheduler Scheduler> template <GemmPipelineScheduler Scheduler>
struct PipelineImpl : public PipelineImplBase struct PipelineImpl : public PipelineImplBase
{ {
...@@ -110,9 +107,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem> ...@@ -110,9 +107,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
constexpr index_t B_LDS_Read_Width = KPerXDL; constexpr index_t B_LDS_Read_Width = KPerXDL;
constexpr index_t A_Buffer_Load_Inst_Num = constexpr index_t A_Buffer_Load_Inst_Num =
MPerBlock * KPerBlock / (BlockSize * VectorSizeA); MPerBlock * KPerBlock / (BlockSize * GetVectorSizeA());
constexpr index_t B_Buffer_Load_Inst_Num = constexpr index_t B_Buffer_Load_Inst_Num =
NPerBlock * KPerBlock / (BlockSize * VectorSizeB); NPerBlock * KPerBlock / (BlockSize * GetVectorSizeB());
constexpr index_t A_LDS_Write_Inst_Num = MPerBlock * KPerBlock / (BlockSize * KPerXDL); constexpr index_t A_LDS_Write_Inst_Num = MPerBlock * KPerBlock / (BlockSize * KPerXDL);
constexpr index_t B_LDS_Write_Inst_Num = NPerBlock * KPerBlock / (BlockSize * KPerXDL); constexpr index_t B_LDS_Write_Inst_Num = NPerBlock * KPerBlock / (BlockSize * KPerXDL);
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -20,6 +20,8 @@ struct BaseGemmPipelineAgBgCrMem ...@@ -20,6 +20,8 @@ struct BaseGemmPipelineAgBgCrMem
using BDataType = remove_cvref_t<typename Problem::BDataType>; using BDataType = remove_cvref_t<typename Problem::BDataType>;
using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>; using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
static constexpr index_t BlockSize = Problem::kBlockSize; static constexpr index_t BlockSize = Problem::kBlockSize;
static constexpr index_t MPerBlock = BlockGemmShape::kM; static constexpr index_t MPerBlock = BlockGemmShape::kM;
static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t NPerBlock = BlockGemmShape::kN;
...@@ -113,9 +115,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem> ...@@ -113,9 +115,9 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t NPerBlock = BlockGemmShape::kN;
static constexpr index_t KPerBlock = BlockGemmShape::kK; static constexpr index_t KPerBlock = BlockGemmShape::kK;
static constexpr index_t VectorSizeA = Policy::template GetVectorSizeA<Problem>(); static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA<Problem>(); }
static constexpr index_t VectorSizeB = Policy::template GetVectorSizeB<Problem>(); static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
static constexpr index_t VectorSizeC = Policy::template GetVectorSizeC<Problem>(); static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
static constexpr bool kPadM = Problem::kPadM; static constexpr bool kPadM = Problem::kPadM;
static constexpr bool kPadN = Problem::kPadN; static constexpr bool kPadN = Problem::kPadN;
...@@ -133,11 +135,6 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem> ...@@ -133,11 +135,6 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
return Policy::template GetSmemSize<Problem>(); return Policy::template GetSmemSize<Problem>();
} }
CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC()
{
return Policy::template IsTransposeC<Problem>();
}
template <GemmPipelineScheduler Scheduler> template <GemmPipelineScheduler Scheduler>
struct PipelineImpl : public PipelineImplBase struct PipelineImpl : public PipelineImplBase
{ {
......
...@@ -31,21 +31,21 @@ struct GemmPipelineAGmemBGmemCRegV1 ...@@ -31,21 +31,21 @@ struct GemmPipelineAGmemBGmemCRegV1
static constexpr index_t kNPerBlock = BlockGemmShape::kN; static constexpr index_t kNPerBlock = BlockGemmShape::kN;
static constexpr index_t kKPerBlock = BlockGemmShape::kK; static constexpr index_t kKPerBlock = BlockGemmShape::kK;
static constexpr index_t VectorSizeA = Problem::VectorSizeA; static constexpr index_t GetVectorSizeA() { return Problem::VectorSizeA; }
static constexpr index_t VectorSizeB = Problem::VectorSizeB; static constexpr index_t GetVectorSizeB() { return Problem::VectorSizeB; }
static constexpr index_t VectorSizeC = Problem::VectorSizeC; static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; }
static constexpr bool kPadM = Problem::kPadM; static constexpr bool kPadM = Problem::kPadM;
static constexpr bool kPadN = Problem::kPadN; static constexpr bool kPadN = Problem::kPadN;
static constexpr bool kPadK = Problem::kPadK; static constexpr bool kPadK = Problem::kPadK;
CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{ {
return Policy::template GetSmemSize<Problem>(); return Policy::template GetSmemSize<Problem>();
} }
CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
template <typename ADramBlockWindowTmp, template <typename ADramBlockWindowTmp,
typename BDramBlockWindowTmp, typename BDramBlockWindowTmp,
typename AElementFunction, typename AElementFunction,
......
...@@ -16,8 +16,6 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy ...@@ -16,8 +16,6 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
static constexpr auto I1 = number<1>{}; static constexpr auto I1 = number<1>{};
static constexpr auto I2 = number<2>{}; static constexpr auto I2 = number<2>{};
static constexpr bool TransposeC = true;
// 3d + padding // 3d + padding
template <typename Problem> template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
...@@ -383,8 +381,6 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy ...@@ -383,8 +381,6 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
} }
} }
CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; }
template <typename Problem> template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
{ {
...@@ -397,7 +393,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy ...@@ -397,7 +393,7 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
WarpTile::at(I0), WarpTile::at(I0),
WarpTile::at(I1), WarpTile::at(I1),
WarpTile::at(I2), WarpTile::at(I2),
TransposeC>; Problem::TransposeC>;
using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType, using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
typename Problem::BDataType, typename Problem::BDataType,
typename Problem::CDataType, typename Problem::CDataType,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -25,6 +25,8 @@ struct GemmPipelineAGmemBGmemCRegV2 ...@@ -25,6 +25,8 @@ struct GemmPipelineAGmemBGmemCRegV2
static constexpr index_t kNPerBlock = BlockGemmShape::kN; static constexpr index_t kNPerBlock = BlockGemmShape::kN;
static constexpr index_t kKPerBlock = BlockGemmShape::kK; static constexpr index_t kKPerBlock = BlockGemmShape::kK;
CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize() CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
{ {
return integer_divide_ceil( return integer_divide_ceil(
...@@ -36,8 +38,6 @@ struct GemmPipelineAGmemBGmemCRegV2 ...@@ -36,8 +38,6 @@ struct GemmPipelineAGmemBGmemCRegV2
Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size(); Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
} }
CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
template <typename ADramBlockWindowTmp, template <typename ADramBlockWindowTmp,
typename BDramBlockWindowTmp, typename BDramBlockWindowTmp,
typename AElementFunction, typename AElementFunction,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -27,6 +27,8 @@ struct GemmPipelineProblemBase ...@@ -27,6 +27,8 @@ struct GemmPipelineProblemBase
using BLayout = remove_cvref_t<typename Traits::BLayout>; using BLayout = remove_cvref_t<typename Traits::BLayout>;
using CLayout = remove_cvref_t<typename Traits::CLayout>; using CLayout = remove_cvref_t<typename Traits::CLayout>;
static constexpr bool TransposeC = Traits::TransposeC;
static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
static constexpr bool kPadM = Traits::kPadM; static constexpr bool kPadM = Traits::kPadM;
...@@ -111,7 +113,6 @@ struct GemmPipelineProblemBase ...@@ -111,7 +113,6 @@ struct GemmPipelineProblemBase
return kPadK ? 1 : GetAlignmentB(); return kPadK ? 1 : GetAlignmentB();
} }
}(); }();
static constexpr index_t VectorSizeC = []() { static constexpr index_t VectorSizeC = []() {
if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>) if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
{ {
......
...@@ -549,12 +549,6 @@ struct UniversalGemmPipelineAgBgCrPolicy ...@@ -549,12 +549,6 @@ struct UniversalGemmPipelineAgBgCrPolicy
return TileEncodingPattern::MakeShuffled2DStaticTileDistribution(); return TileEncodingPattern::MakeShuffled2DStaticTileDistribution();
} }
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC()
{
return Problem::TransposeC;
}
template <typename Problem> template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
{ {
......
...@@ -285,9 +285,14 @@ ENDFOREACH() ...@@ -285,9 +285,14 @@ ENDFOREACH()
if(CK_DEVICE_OTHER_INSTANCES) if(CK_DEVICE_OTHER_INSTANCES)
add_library(device_other_operations STATIC ${CK_DEVICE_OTHER_INSTANCES}) add_library(device_other_operations ${CK_DEVICE_OTHER_INSTANCES})
add_library(composablekernels::device_other_operations ALIAS device_other_operations) add_library(composablekernels::device_other_operations ALIAS device_other_operations)
set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(device_other_operations
PROPERTIES
VERSION ${CMAKE_PROJECT_VERSION}
SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
)
target_include_directories(device_other_operations PUBLIC target_include_directories(device_other_operations PUBLIC
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
...@@ -316,10 +321,15 @@ if(CK_DEVICE_OTHER_INSTANCES) ...@@ -316,10 +321,15 @@ if(CK_DEVICE_OTHER_INSTANCES)
) )
endif() endif()
if(CK_DEVICE_GEMM_INSTANCES) if(CK_DEVICE_GEMM_INSTANCES)
add_library(device_gemm_operations STATIC ${CK_DEVICE_GEMM_INSTANCES}) add_library(device_gemm_operations ${CK_DEVICE_GEMM_INSTANCES})
add_library(composablekernels::device_gemm_operations ALIAS device_gemm_operations) add_library(composablekernels::device_gemm_operations ALIAS device_gemm_operations)
target_compile_features(device_gemm_operations PUBLIC) target_compile_features(device_gemm_operations PUBLIC)
set_target_properties(device_gemm_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_gemm_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(device_gemm_operations
PROPERTIES
VERSION ${CMAKE_PROJECT_VERSION}
SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
)
target_include_directories(device_gemm_operations PUBLIC target_include_directories(device_gemm_operations PUBLIC
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
) )
...@@ -332,10 +342,15 @@ if(CK_DEVICE_GEMM_INSTANCES) ...@@ -332,10 +342,15 @@ if(CK_DEVICE_GEMM_INSTANCES)
) )
endif() endif()
if(CK_DEVICE_CONV_INSTANCES) if(CK_DEVICE_CONV_INSTANCES)
add_library(device_conv_operations STATIC ${CK_DEVICE_CONV_INSTANCES}) add_library(device_conv_operations ${CK_DEVICE_CONV_INSTANCES})
add_library(composablekernels::device_conv_operations ALIAS device_conv_operations) add_library(composablekernels::device_conv_operations ALIAS device_conv_operations)
target_compile_features(device_conv_operations PUBLIC) target_compile_features(device_conv_operations PUBLIC)
set_target_properties(device_conv_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_conv_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(device_conv_operations
PROPERTIES
VERSION ${CMAKE_PROJECT_VERSION}
SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
)
target_include_directories(device_conv_operations PUBLIC target_include_directories(device_conv_operations PUBLIC
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange>
...@@ -354,7 +369,12 @@ endif() ...@@ -354,7 +369,12 @@ endif()
if(CK_DEVICE_MHA_INSTANCES) if(CK_DEVICE_MHA_INSTANCES)
set(gpu_list ${INST_TARGETS}) set(gpu_list ${INST_TARGETS})
if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a") if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES}) add_library(device_mha_operations ${CK_DEVICE_MHA_INSTANCES})
set_target_properties(device_mha_operations
PROPERTIES
VERSION ${CMAKE_PROJECT_VERSION}
SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
)
add_library(composablekernels::device_mha_operations ALIAS device_mha_operations) add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
target_compile_features(device_mha_operations PUBLIC) target_compile_features(device_mha_operations PUBLIC)
set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
...@@ -369,10 +389,15 @@ if(CK_DEVICE_MHA_INSTANCES) ...@@ -369,10 +389,15 @@ if(CK_DEVICE_MHA_INSTANCES)
endif() endif()
endif() endif()
if(CK_DEVICE_CONTRACTION_INSTANCES) if(CK_DEVICE_CONTRACTION_INSTANCES)
add_library(device_contraction_operations STATIC ${CK_DEVICE_CONTRACTION_INSTANCES}) add_library(device_contraction_operations ${CK_DEVICE_CONTRACTION_INSTANCES})
add_library(composablekernels::device_contraction_operations ALIAS device_contraction_operations) add_library(composablekernels::device_contraction_operations ALIAS device_contraction_operations)
target_compile_features(device_contraction_operations PUBLIC) target_compile_features(device_contraction_operations PUBLIC)
set_target_properties(device_contraction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_contraction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(device_contraction_operations
PROPERTIES
VERSION ${CMAKE_PROJECT_VERSION}
SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
)
target_include_directories(device_contraction_operations PUBLIC target_include_directories(device_contraction_operations PUBLIC
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/contraction> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/contraction>
...@@ -386,10 +411,15 @@ if(CK_DEVICE_CONTRACTION_INSTANCES) ...@@ -386,10 +411,15 @@ if(CK_DEVICE_CONTRACTION_INSTANCES)
) )
endif() endif()
if(CK_DEVICE_REDUCTION_INSTANCES) if(CK_DEVICE_REDUCTION_INSTANCES)
add_library(device_reduction_operations STATIC ${CK_DEVICE_REDUCTION_INSTANCES}) add_library(device_reduction_operations ${CK_DEVICE_REDUCTION_INSTANCES})
add_library(composablekernels::device_reduction_operations ALIAS device_reduction_operations) add_library(composablekernels::device_reduction_operations ALIAS device_reduction_operations)
target_compile_features(device_reduction_operations PUBLIC) target_compile_features(device_reduction_operations PUBLIC)
set_target_properties(device_reduction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(device_reduction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(device_reduction_operations
PROPERTIES
VERSION ${CMAKE_PROJECT_VERSION}
SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
)
target_include_directories(device_reduction_operations PUBLIC target_include_directories(device_reduction_operations PUBLIC
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
) )
......
add_library(utility STATIC add_library(utility
device_memory.cpp device_memory.cpp
host_tensor.cpp host_tensor.cpp
convolution_parameter.cpp convolution_parameter.cpp
...@@ -6,6 +6,10 @@ add_library(utility STATIC ...@@ -6,6 +6,10 @@ add_library(utility STATIC
add_library(composable_kernel::utility ALIAS utility) add_library(composable_kernel::utility ALIAS utility)
set_target_properties(utility PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(utility PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(utility
PROPERTIES
VERSION ${CMAKE_PROJECT_VERSION}
SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR})
target_compile_options(utility PRIVATE ${CMAKE_COMPILER_WARNINGS}) target_compile_options(utility PRIVATE ${CMAKE_COMPILER_WARNINGS})
target_include_directories(utility PUBLIC target_include_directories(utility PUBLIC
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>" "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
......
...@@ -32,9 +32,6 @@ class TestCkTileBatchedGemm : public ::testing::Test ...@@ -32,9 +32,6 @@ class TestCkTileBatchedGemm : public ::testing::Test
constexpr bool kPadM = false; constexpr bool kPadM = false;
constexpr bool kPadN = false; constexpr bool kPadN = false;
constexpr bool kPadK = false; constexpr bool kPadK = false;
constexpr bool kTilePermute = false;
// The rank and permutation will also be generate out by the CodeGen part.
constexpr ck_tile::index_t kOutputRank = 2;
constexpr int kBlockPerCu = 1; constexpr int kBlockPerCu = 1;
...@@ -51,32 +48,12 @@ class TestCkTileBatchedGemm : public ::testing::Test ...@@ -51,32 +48,12 @@ class TestCkTileBatchedGemm : public ::testing::Test
constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32;
constexpr ck_tile::index_t K_Warp_Tile = 8; constexpr ck_tile::index_t K_Warp_Tile = 8;
// Whether doing the CShuffle (transpose before the global memory), depending on the output
// layout.
constexpr bool CShuffleEpilogue =
std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
using CodegenGemmShape = using CodegenGemmShape =
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>, ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>, ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>; ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
using TilePartitioner = ck_tile::GemmTile2DPartitioner<CodegenGemmShape>; using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
using GemmEpilogue = std::conditional_t<
CShuffleEpilogue,
ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
CDataType,
kPadM,
kPadN,
kTilePermute,
kOutputRank,
1,
0,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock>>,
ck_tile::Default2DEpilogue<
ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;
using CodegenGemmTraits = using CodegenGemmTraits =
ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>; ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
...@@ -88,6 +65,20 @@ class TestCkTileBatchedGemm : public ::testing::Test ...@@ -88,6 +65,20 @@ class TestCkTileBatchedGemm : public ::testing::Test
CodegenGemmTraits>; CodegenGemmTraits>;
using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>; using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
using GemmEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<AccDataType,
CDataType,
CLayout,
CodegenGemmPipeline::BlockSize,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
M_Warp,
N_Warp,
M_Warp_Tile,
N_Warp_Tile,
K_Warp_Tile,
CodegenPipelineProblem::TransposeC>>;
using Kernel = using Kernel =
ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>; ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <sstream> #include <sstream>
...@@ -56,6 +56,8 @@ class TestCkTileGemmPipeline : public ::testing::Test ...@@ -56,6 +56,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
constexpr bool TransposeC = false; constexpr bool TransposeC = false;
constexpr int kBlockPerCu = 1; constexpr int kBlockPerCu = 1;
constexpr ck_tile::index_t TileParitionerGroupNum = 8;
constexpr ck_tile::index_t TileParitionerM01 = 4;
// =============================================== // ===============================================
...@@ -63,10 +65,8 @@ class TestCkTileGemmPipeline : public ::testing::Test ...@@ -63,10 +65,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>, ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>, ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>; ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
using TilePartitioner = ck_tile::GemmTile2DPartitioner<GemmShape>; using TilePartitioner = ck_tile::
GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
using GemmEpilogue = ck_tile::Default2DEpilogue<
ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>;
using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>; using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
using GemmUniversalTraits = ck_tile:: using GemmUniversalTraits = ck_tile::
...@@ -106,6 +106,20 @@ class TestCkTileGemmPipeline : public ::testing::Test ...@@ -106,6 +106,20 @@ class TestCkTileGemmPipeline : public ::testing::Test
ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem, ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem,
ck_tile::UniversalGemmPipelineAgBgCrPolicy>>; ck_tile::UniversalGemmPipelineAgBgCrPolicy>>;
using GemmEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<AccDataType,
CDataType,
CLayout,
GemmPipeline::BlockSize,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
M_Warp,
N_Warp,
M_Warp_Tile,
N_Warp_Tile,
K_Warp_Tile,
UniversalGemmProblem::TransposeC>>;
using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>; using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
auto kargs = Kernel::MakeKernelArgs(args); auto kargs = Kernel::MakeKernelArgs(args);
...@@ -244,7 +258,7 @@ class TestCkTileGemmPipeline : public ::testing::Test ...@@ -244,7 +258,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
public: public:
std::vector<int> k_batches_; std::vector<int> k_batches_;
void SetUp() override { k_batches_ = {1}; } void SetUp() override { k_batches_ = {1, 2}; }
template <bool PadM = true, bool PadN = true, bool PadK = true> template <bool PadM = true, bool PadN = true, bool PadK = true>
void Run(const int M, void Run(const int M,
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <sstream> #include <sstream>
...@@ -29,9 +29,6 @@ class TestCkTileGroupedGemm : public ::testing::Test ...@@ -29,9 +29,6 @@ class TestCkTileGroupedGemm : public ::testing::Test
static const bool kPadM = false; static const bool kPadM = false;
static const bool kPadN = false; static const bool kPadN = false;
static const bool kPadK = false; static const bool kPadK = false;
static const bool kTilePermute = false;
static const ck_tile::index_t kOutputRank = 2;
static const int kBlockPerCu = 1; static const int kBlockPerCu = 1;
static const ck_tile::index_t M_Tile = 128; static const ck_tile::index_t M_Tile = 128;
...@@ -60,26 +57,6 @@ class TestCkTileGroupedGemm : public ::testing::Test ...@@ -60,26 +57,6 @@ class TestCkTileGroupedGemm : public ::testing::Test
using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>; using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
template <typename CLayout>
using GemmEpilogue =
std::conditional_t<std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>,
ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<AccDataType,
CDataType,
GroupedGemKernelParam::kPadM,
GroupedGemKernelParam::kPadN,
GroupedGemKernelParam::kTilePermute,
GroupedGemKernelParam::kOutputRank,
1,
0,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock>>,
ck_tile::Default2DEpilogue<
ck_tile::Default2DEpilogueProblem<AccDataType,
CDataType,
GroupedGemKernelParam::kPadM,
GroupedGemKernelParam::kPadN>>>;
template <typename ALayout, typename BLayout, typename CLayout> template <typename ALayout, typename BLayout, typename CLayout>
using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemKernelParam::kPadM, using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemKernelParam::kPadM,
GroupedGemKernelParam::kPadN, GroupedGemKernelParam::kPadN,
...@@ -100,10 +77,25 @@ class TestCkTileGroupedGemm : public ::testing::Test ...@@ -100,10 +77,25 @@ class TestCkTileGroupedGemm : public ::testing::Test
using CodegenGemmPipeline = using CodegenGemmPipeline =
ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>>; ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>>;
template <typename ALayout, typename BLayout, typename CLayout>
using GemmEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
AccDataType,
CDataType,
CLayout,
CodegenGemmPipeline<ALayout, BLayout, CLayout>::BlockSize,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
GroupedGemKernelParam::M_Warp,
GroupedGemKernelParam::N_Warp,
GroupedGemKernelParam::M_Warp_Tile,
GroupedGemKernelParam::N_Warp_Tile,
GroupedGemKernelParam::K_Warp_Tile,
CodegenPipelineProblem<ALayout, BLayout, CLayout>::TransposeC>>;
template <typename ALayout, typename BLayout, typename CLayout> template <typename ALayout, typename BLayout, typename CLayout>
using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
CodegenGemmPipeline<ALayout, BLayout, CLayout>, CodegenGemmPipeline<ALayout, BLayout, CLayout>,
GemmEpilogue<CLayout>>; GemmEpilogue<ALayout, BLayout, CLayout>>;
using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs; using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs) std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment