"tests/pipelines/vscode:/vscode.git/clone" did not exist on "05a36d5c1a17d0a3dbc7a1efb83ba1bdbfbf1fb2"
Commit e7cde218 authored by Harisankar Sadasivan's avatar Harisankar Sadasivan
Browse files

Apply changes suggested in PR review: remove stale comments and correct the copyright headers.

parent 57a38a1d
File mode changed from 100644 to 100755
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -117,7 +117,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    auto f_get_default_stride =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(stride == 0)
+            if(stride == -1)
            {
                // give a chance if stride is zero, return a default packed stride
                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
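For context, the new sentinel means "derive a packed stride": a row-major matrix defaults to its column count, a column-major one to its row count. A minimal standalone sketch of that rule (plain C++; the CK lambda keeps std::size_t and relies on the usual unsigned conversion of -1, while this sketch uses a signed stride for clarity):

    #include <cstddef>

    enum class Layout { RowMajor, ColumnMajor };

    // -1 (rather than 0) now means "no stride given, use the packed one".
    std::size_t default_stride(std::size_t row, std::size_t col,
                               std::ptrdiff_t stride, Layout layout)
    {
        if(stride == -1)
            return layout == Layout::RowMajor ? col : row;
        return static_cast<std::size_t>(stride);
    }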
@@ -162,18 +162,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
    }
-#if 0
-    printf("B matrix:\n");
-    for (int in = 0; in < N; in++)
-    {
-        for (int ik = 0; ik < K; ik++)
-        {
-            printf("%02x ", *(reinterpret_cast<uint8_t*>(&b_k_n(ik,in))));
-            if(ik%8==7) printf("|");
-        }
-        printf("\n");
-    }
-#endif
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -147,10 +147,8 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
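The K_split expression is the standard integer ceiling-division idiom, ceil(K / k_grain) * KPerBlock. A quick numeric illustration (values are made up, not from this commit):

    // (K + k_grain - 1) / k_grain rounds up in integer arithmetic, e.g.
    // K = 1000, k_grain = 256, KPerBlock = 64:
    //   (1000 + 255) / 256 = 4  ->  K_split = 4 * 64 = 256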
-            hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
-                                             0,
-                                             arg.M * arg.N * sizeof(CDataType),
-                                             stream_config.stream_id_)); // HS
+            hipGetErrorString(hipMemsetAsync(
+                arg.p_c_grid, 0, arg.M * arg.N * sizeof(CDataType), stream_config.stream_id_));
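The memset is needed because this stream-k path accumulates partial tiles into C (the tile map later in this diff is instantiated with StreamKReductionStrategy::Atomic), so the output buffer must start zeroed. Note that hipGetErrorString here only stringifies and discards the result; a hedged sketch of actually checking it (this helper is illustrative, not part of the commit):

    #include <hip/hip_runtime.h>
    #include <stdexcept>

    inline void hip_check(hipError_t err)
    {
        if(err != hipSuccess)
            throw std::runtime_error(hipGetErrorString(err)); // surface the failure
    }

    // usage, mirroring the call above:
    // hip_check(hipMemsetAsync(
    //     arg.p_c_grid, 0, arg.M * arg.N * sizeof(CDataType), stream_config.stream_id_));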
            const auto Run = [&](const auto& kernel) {
                dim3 grid_dim;
                if(arg.Grid_size < 0)
@@ -193,25 +191,13 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                };
-                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
-                        stream_config,
-                        run_flush_cache,
-                        kernel,
-                        grid_dim,
-                        // dim3(gdx, gdy, gdz),
-                        dim3(BlockSize),
-                        0,
-                        arg_);
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config, run_flush_cache, kernel, grid_dim, dim3(BlockSize), 0, arg_);
                }
                else
                {
-                    ave_time = launch_and_time_kernel(stream_config,
-                                                      kernel,
-                                                      // dim3(gdx, gdy, gdz),
-                                                      grid_dim,
-                                                      dim3(BlockSize),
-                                                      0,
-                                                      arg);
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, grid_dim, dim3(BlockSize), 0, arg);
                }
            };
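launch_and_time_kernel and its _with_preprocess variant are CK utilities that launch the kernel and return an averaged runtime. A hedged sketch of the event-based timing such helpers typically wrap (warm-up and averaging details are assumptions; hip_check is the sketch above):

    #include <hip/hip_runtime.h>

    template <typename Kernel, typename... Args>
    float time_kernel_ms(Kernel kernel, dim3 grid, dim3 block, hipStream_t stream, Args... args)
    {
        hipEvent_t start, stop;
        hip_check(hipEventCreate(&start));
        hip_check(hipEventCreate(&stop));
        hip_check(hipEventRecord(start, stream));
        kernel<<<grid, block, 0, stream>>>(args...); // launch under test
        hip_check(hipEventRecord(stop, stream));
        hip_check(hipEventSynchronize(stop));
        float ms = 0.f;
        hip_check(hipEventElapsedTime(&ms, start, stop));
        hip_check(hipEventDestroy(start));
        hip_check(hipEventDestroy(stop));
        return ms;
    }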
@@ -477,7 +463,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
                          BElementwiseOperation,
                          CElementwiseOperation)
    {
-        // return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, KBatch};
        return Argument{
-            p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, streamk_sel, Grid_size}; // HS
+            p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, streamk_sel, Grid_size};
    }
...
@@ -1461,31 +1461,27 @@ struct BlockToCTileMap_GemmStreamK_v2
        // check if there's enough work for DP+ stream-k
        bool bigEnough = num_tiles > grid_size;
-        // select between 1 tile and 2 tile sk
+        // select between stream-k strategies
        uint32_t sk_tiles = 0;
-        if(streamk_sel == 1)
+        if(streamk_sel == 1) // 1 tile stream-k
        {
            sk_tiles = bigEnough ? (num_tiles % grid_size) : num_tiles;
        }
-        else if(streamk_sel == 2)
+        else if(streamk_sel == 2) // 2-tile stream-k
        {
            sk_tiles = bigEnough ? (grid_size + num_tiles % grid_size) : num_tiles;
        }
-        else if(streamk_sel == 3)
+        else if(streamk_sel == 3) // 3-tile stream-k
        {
            sk_tiles = (num_tiles > (2 * grid_size)) ? (2 * grid_size + num_tiles % grid_size)
                                                     : num_tiles;
        }
-        else if(streamk_sel == 4)
+        else if(streamk_sel == 4) // 4-tile stream-k
        {
            sk_tiles = (num_tiles > (3 * grid_size)) ? (3 * grid_size + num_tiles % grid_size)
                                                     : num_tiles;
        }
        sk_num_blocks = sk_tiles;
-        // if(sk_tiles < sk_num_blocks)
-        // {
-        //     sk_num_blocks = sk_tiles;
-        // }
        // remaining tiles are DP tiles
        dp_tiles = bigEnough ? (num_tiles - sk_tiles) : 0;
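A worked example of the selection above (illustrative numbers): with num_tiles = 130 and grid_size = 30, bigEnough holds and

    // streamk_sel == 1: sk_tiles = 130 % 30          = 10,  dp_tiles = 120
    // streamk_sel == 2: sk_tiles = 30 + 130 % 30     = 40,  dp_tiles = 90
    // streamk_sel == 3: sk_tiles = 2 * 30 + 130 % 30 = 70,  dp_tiles = 60
    // streamk_sel == 4: sk_tiles = 3 * 30 + 130 % 30 = 100, dp_tiles = 30

so a larger selector shifts more of the grid from data-parallel tiles to stream-k blocks.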
@@ -1508,7 +1504,6 @@ struct BlockToCTileMap_GemmStreamK_v2
            dp_num_blocks      = dp_tiles;
            dp_start_block_idx = sk_num_blocks;
-            // dp_start_block_idx = ((sk_num_blocks + grid_size - 1) / grid_size) * grid_size;
        }
        n_tiles = MDiv2(math::integer_divide_ceil(n, NPerBlock));
@@ -1523,30 +1518,29 @@ struct BlockToCTileMap_GemmStreamK_v2
            equiv_tiles_little = MDiv(upper_little / k_iters_per_tile.get());
        }
-#if 0
-        printf("streamk_sel=%0d,grid_size=%0d, num_tiles:%d, dp_tiles:%d, sk_tiles:%u, "
-               "sk_num_blocks:%d,dp_num_blocks:%d,sk_num_big_blocks:%d, "
-               "sk_total_iters:%d, dp_start_block_idx:%d, "
-               "k_iters_per_tile:%d, k_iters_per_big_block:%d, reduction_start_block_idx:%u, "
-               " workspace(acc float):%u\n",
-               streamk_sel,
-               grid_size,
-               // occupancy,
-               // get_grid_dims(num_cu, occupancy).x,
-               num_tiles,
-               dp_tiles,
-               get_sk_tiles(),
-               sk_num_blocks,
-               dp_num_blocks,
-               sk_num_big_blocks,
-               sk_total_iters,
-               dp_start_block_idx,
-               k_iters_per_tile.get(),
-               k_iters_per_big_block,
-               reduction_start_block_idx,
-               get_workspace_size(sizeof(float)));
-#endif
+        if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+        {
+            printf("streamk_sel=%0d,grid_size=%0d, num_tiles:%d, dp_tiles:%d, sk_tiles:%u, "
+                   "sk_num_blocks:%d,dp_num_blocks:%d,sk_num_big_blocks:%d, "
+                   "sk_total_iters:%d, dp_start_block_idx:%d, "
+                   "k_iters_per_tile:%d, k_iters_per_big_block:%d, reduction_start_block_idx:%u, "
+                   " workspace(acc float):%u\n",
+                   streamk_sel,
+                   grid_size,
+                   num_tiles,
+                   dp_tiles,
+                   get_sk_tiles(),
+                   sk_num_blocks,
+                   dp_num_blocks,
+                   sk_num_big_blocks,
+                   sk_total_iters,
+                   dp_start_block_idx,
+                   k_iters_per_tile.get(),
+                   k_iters_per_big_block,
+                   reduction_start_block_idx,
+                   get_workspace_size(sizeof(float)));
+        }
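The dump is now gated at runtime rather than compiled out. The diff does not show how CK_ENV and EnvIsEnabled are implemented; a minimal sketch of the likely shape, assuming the macro resolves to an environment-variable lookup by name:

    #include <cstdlib>
    #include <cstring>

    inline bool env_is_enabled(const char* name) // e.g. name = "CK_LOGGING"
    {
        const char* v = std::getenv(name);
        return v != nullptr && std::strcmp(v, "0") != 0; // unset or "0" -> disabled
    }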
    }
    __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
@@ -1656,90 +1650,6 @@ struct BlockToCTileMap_GemmStreamK_v2
        m_tile_idx_with_adapt = tile_idx_local % sub_m_adapt;
        return make_tuple(m_tile_idx_with_adapt + m_tile_idx_sub0 * tile_swizzle_sub_m,
                          n_tile_idx_with_adapt);
-// adding gfx94x optimized
-// index_t block_1d_id = tile_idx;
-// const index_t N0 = n_tiles_value;
-// const index_t M0 = math::integer_divide_ceil(n * m / m, MPerBlock);
-// // index_t GroupNum = 8;
-// // index_t M01_ = 4;
-// if(M0 == 1)
-// {
-//     return make_tuple(0, block_1d_id);
-// }
-// else if(N0 == 1)
-// {
-//     return make_tuple(block_1d_id, 0);
-// }
-// // block_1d_id = block_1d_id % (M0 * N0); // swallow batch index
-// else
-// {
-//     const auto group_size = math::integer_divide_ceil(M0 * N0, GroupNum);
-//     const auto big_group_num = GroupNum - (group_size * GroupNum - M0 * N0);
-//     auto group_id_x = block_1d_id % GroupNum;
-//     auto group_id_y = block_1d_id / GroupNum;
-//     auto remap_block_1d_id =
-//         group_id_x <= big_group_num
-//             ? group_id_x * group_size + group_id_y
-//             : group_id_x * group_size + big_group_num - group_id_x + group_id_y;
-//     index_t idx_N0 = remap_block_1d_id % N0;
-//     index_t idx_M0 = remap_block_1d_id / N0;
-//     const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;
-//     index_t idx_M00 = idx_M0 / M01_;
-//     index_t idx_M01 = idx_M0 % M01_;
-//     index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;
-//     /**
-//      * idxN0
-//      *
-//      * |< mtx N >|
-//      *
-//      * NPerBlock NPerBlock NPerBlock NPerBlock
-//      * N_0 N_1 N_2 N_3
-//      * - |-----------|-----------|-----------|-----|-----|-
-//      * ^ | - - 0 |/----> 2 | | | |
-//      * | | | / | | | | | M_0 MPerBlock
-//      * | M | /| | | | | |
-//      * |-0---|---/-|-----|-----|-----------|-----|-----|-
-//      * | 1 | / | | | blockid | | |
-//      * idxM0 | | | / | V | 5 | | | M_1 MPerBlock
-//      * | - V 1 | - 3 | | | |
-//      * |-----------|-----------|-----------|-----|-----|-
-//      * mtx M | | | | | |
-//      * | | | | | | M_2 MPerBlock
-//      * | | | | | |
-//      * |-----------|-----------|-----------|-----|-----|-
-//      * | | | | | |
-//      * | | | | | | M_3 MPerBlock
-//      * | | | | | |
-//      * |-----------|-----------|-----------|-----|-----|-
-//      * V | | | | | |
-//      * - |-----------|-----------|-----------|-----|-----|- M_4 MPerBlock
-//      * | | | | | |
-//      * |-----------|-----------|-----------|-----|-----|-
-//      * Example:
-//      * assume:
-//      *    M0 = 5
-//      *    N0 = 4
-//      *    block_1d_id = 5
-//      *    M01 = 2
-//      *
-//      *    idx_N0 = 1
-//      *    idx_M0 = 1
-//      *    M01_adapt = 2
-//      *    idx_M00 = 0
-//      *    idx_M01 = 1
-//      *    idx_N0_M01_local = 5
-//      *    output {1, 2}
-//      */
-//     return make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
-//                       idx_N0_M01_local / M01_adapt);
-// }
    }
    __host__ __device__ uint32_t get_workspace_size_for_acc(uint32_t acc_element_bytes) const
...
@@ -32,22 +32,13 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
-        // __attribute__((amdgpu_waves_per_eu(1, 1)))
        kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    // auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        // karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        // karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_a_grid,
-        karg.p_b_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg);
+        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg);
#else
    ignore = karg;
#endif // end of if (defined(__gfx9__))
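For readers unfamiliar with the pattern: __launch_bounds__(max_threads, min_occupancy) caps the block size so the compiler can budget registers for the requested occupancy, and the __shared__ array reserves the block's LDS scratch. A minimal standalone sketch (sizes and kernel are illustrative, not CK's values):

    #include <hip/hip_runtime.h>

    constexpr int kMaxThreads = 256;

    __global__ void __launch_bounds__(kMaxThreads, 1) toy_kernel(float* out)
    {
        __shared__ float lds[kMaxThreads];          // static LDS allocation
        lds[threadIdx.x] = static_cast<float>(threadIdx.x);
        __syncthreads();                            // writes visible block-wide
        out[blockIdx.x * blockDim.x + threadIdx.x] = lds[threadIdx.x];
    }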
@@ -62,8 +53,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
-        // __attribute__((amdgpu_waves_per_eu(1, 1)))
        kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    // Pass two lds pointer is the key to tell compiler that ds_read/write
@@ -71,17 +61,8 @@ __global__ void
    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    // auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        // karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        // karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_a_grid,
-        karg.p_b_grid,
-        karg.p_c_grid,
-        p_shared_0,
-        p_shared_1,
-        karg);
+        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg);
#else
    ignore = karg;
#endif // end of if (defined(__gfx9__))
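The two __shared__ buffers implement LDS double buffering: the pipeline computes out of one buffer while the next K chunk is staged into the other, and handing the kernel two distinct pointers lets the compiler schedule ds_read/ds_write without aliasing concerns (per the comment above). A hedged sketch of the ping-pong idea, far simpler than CK's pipeline (launch with 256 threads per block):

    __global__ void pingpong_sketch(const float* a, float* out, int num_chunks)
    {
        __shared__ float buf0[256];
        __shared__ float buf1[256];
        float* bufs[2] = {buf0, buf1};
        float acc = 0.f;
        bufs[0][threadIdx.x] = a[threadIdx.x]; // stage chunk 0
        __syncthreads();
        for(int c = 0; c < num_chunks; ++c)
        {
            if(c + 1 < num_chunks) // prefetch next chunk into the idle buffer
                bufs[(c + 1) % 2][threadIdx.x] = a[(c + 1) * 256 + threadIdx.x];
            acc += bufs[c % 2][threadIdx.x]; // compute on the current buffer
            __syncthreads(); // prefetch written and compute done before the swap
        }
        out[threadIdx.x] = acc;
    }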
@@ -155,15 +136,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
        MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
-    // __host__ static auto CalculateGridSize(index_t M, index_t N) //, index_t KBatch)
-    // {
-    //     // return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
-    //     // return ((Block2CTileMap::CalculateGridSize(M, N)) * KBatch);
-    //     // return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
-    //     return Block2CTileMap::CalculateGridSize(M, N);
-    // }
    __host__ static auto CalculateMPadded(index_t M)
    {
        return math::integer_least_multiple(M, MPerBlock);
@@ -995,10 +967,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
        }
        else
        {
-            // constexpr auto KReadVec = math::lcm(AK1Number, BK1Number);
-            // auto K_t = KReadVec;
-            // auto KReadPadSplited = math::integer_divide_ceil(karg.K, K_t) * KReadVec;
-            if(karg.K <= 0) // HS
+            if(karg.K <= 0)
            {
                return false;
            }
@@ -1103,10 +1073,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                      << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
                      << std::endl;
            }
-            // if(karg.KBatch > 1)
-            // {
-            //     return false;
-            // }
        }
        // check gridwise gemm pipeline
@@ -1152,16 +1118,12 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
        return c_grid_desc_mblock_mperblock_nblock_nperblock;
    }
-    // return block_id to C matrix tile idx (m0, n0) mapping
-    // if arch = gfx942
-    // using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
-    // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>;
    using Block2CTileMap_streamk = BlockToCTileMap_GemmStreamK_v2<MPerBlock,
                                                                  NPerBlock,
                                                                  KPerBlock,
                                                                  StreamKReductionStrategy::Atomic,
                                                                  8,
-                                                                 4>; // HS
+                                                                 4>;
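The diff does not spell out the trailing template arguments; going by the constructor comment removed below ("Provide a value for TileSwizzleSubM_"), the 8, 4 plausibly tune the tile-swizzle grouping, but that reading is an inference. What the diff does show is how the map is constructed in the Run loop further down:

    // construction as used later in this commit: M, N, total K iterations,
    // grid size, and the stream-k strategy selector (streamk_sel = 1..4)
    Block2CTileMap_streamk map(problem.M,
                               problem.N,
                               AK0Number * problem.KPadded,
                               problem.Grid_size,
                               problem.Streamk_sel);
    // map.sk_num_blocks and map.dp_start_block_idx then partition the launched
    // blocks into stream-k and data-parallel work.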
    template <bool HasMainKBlockLoop,
              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
@@ -1177,43 +1139,39 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
        const BElementwiseOperation b_element_op{};
        const CElementwiseOperation c_element_op{};
-        // Provide a value for TileSwizzleSubM_
        Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M,
                                                         problem.N,
                                                         AK0Number * problem.KPadded,
                                                         problem.Grid_size,
-                                                         problem.Streamk_sel); // HS
+                                                         problem.Streamk_sel);
-        uint32_t iter_start, iter_end; // HS
-        bool is_sk_block, is_dp_block; //, is_padding_block; //, is_reduction_block; // HS
-        index_t num_k_block_main_loop; // HS
+        uint32_t iter_start, iter_end;
+        bool is_sk_block, is_dp_block;
+        index_t num_k_block_main_loop;
        for(auto block_idx = get_block_1d_id();
            block_idx < block_2_ctile_map_streamk.get_grid_dims();
            block_idx += gridDim.x)
        {
-            // for(unsigned int kbatch_id = 0; kbatch_id < static_cast<unsigned
-            // int>(problem.KBatch);
-            //     kbatch_id++)
            is_sk_block =
                static_cast<uint32_t>(block_idx) < block_2_ctile_map_streamk.sk_num_blocks;
            is_dp_block =
                static_cast<uint32_t>(block_idx) >= block_2_ctile_map_streamk.dp_start_block_idx &&
                static_cast<uint32_t>(block_idx) <
-                    block_2_ctile_map_streamk.reduction_start_block_idx; // HS
+                    block_2_ctile_map_streamk.reduction_start_block_idx;
-            block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end); // HS
-            num_k_block_main_loop = iter_end - iter_start; // HS
+            block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
+            num_k_block_main_loop = iter_end - iter_start;
            while(true)
            {
                uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
                    block_2_ctile_map_streamk.get_current_iter_length(
-                        iter_start, iter_end, num_k_block_main_loop)); // HS
+                        iter_start, iter_end, num_k_block_main_loop));
-                uint32_t tile_idx, iter_offset; // HS
+                uint32_t tile_idx, iter_offset;
                block_2_ctile_map_streamk.get_tile_idx_with_offset(
-                    iter_end - 1, tile_idx, iter_offset); // HS
+                    iter_end - 1, tile_idx, iter_offset);
-                iter_offset =
-                    __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1); // HS
+                iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
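Two idioms above deserve a note: the grid-stride for loop makes the kernel persistent (each block keeps pulling tiles until the map is exhausted), and __builtin_amdgcn_readfirstlane broadcasts lane 0's value so the index arithmetic stays in scalar registers. A toy sketch of the combination (names illustrative):

    __global__ void persistent_sketch(float* out, int num_tiles)
    {
        // fixed-size grid covers an arbitrary tile count
        for(unsigned tile = blockIdx.x; tile < static_cast<unsigned>(num_tiles);
            tile += gridDim.x)
        {
            // value is wave-uniform; readfirstlane tells the compiler so,
            // keeping the index in scalar (SGPR) registers
            int t = __builtin_amdgcn_readfirstlane(static_cast<int>(tile));
            out[t * blockDim.x + threadIdx.x] = static_cast<float>(t);
        }
    }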
                const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M,
                                                                                 problem.MPadded,
@@ -1237,17 +1195,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                    p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
                const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_a_grid /*+ splitk_batch_offset.a_k_split_offset*/,
-                    a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+                    p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
                const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_b_grid /*+ splitk_batch_offset.b_k_split_offset*/,
-                    b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+                    p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-                // const auto block_work_idx =
-                // block_2_ctile_map.CalculateBottomIndex(make_multi_index(block_idx));
                auto block_work_idx =
-                    block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N); // HS
+                    block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N);
                const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
                const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
@@ -1260,7 +1214,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                    __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
                const index_t k0_block_data_idx_on_grid =
-                    __builtin_amdgcn_readfirstlane(iter_offset * AK0Number); // HS
+                    __builtin_amdgcn_readfirstlane(iter_offset * AK0Number);
                // lds max alignment
                constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
@@ -1298,7 +1252,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                    true,
                    BlockwiseGemmPipe::GlobalBufferNum>(
                    a_grid_desc_ak0_m_ak1,
-                    make_multi_index(k0_block_data_idx_on_grid, m_block_data_idx_on_grid, 0), // HS
+                    make_multi_index(k0_block_data_idx_on_grid, m_block_data_idx_on_grid, 0),
                    a_element_op,
                    a_block_desc_ak0_m_ak1,
                    make_multi_index(0, 0, 0),
@@ -1361,7 +1315,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
                    (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
-                    KPerBlock); // HS:AK0*KPadded/KPerBlock
+                    KPerBlock); // AK0*KPadded/KPerBlock
                blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
                    a_grid_desc_ak0_m_ak1,
@@ -1607,7 +1561,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                if(iter_end <= iter_start)
                    break;
                // make sure next loop LDS is ready for use
-                block_sync_lds(); // HS
+                block_sync_lds();
            }
        }
    }
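block_sync_lds is CK's block-wide barrier wrapper; the call keeps a block from starting the next stream-k tile before every thread has finished with the LDS staged for the current one. Functionally it corresponds to this sketch (the real wrapper may add scheduling intrinsics; that detail is an assumption):

    __device__ inline void block_sync_lds_sketch()
    {
        __syncthreads(); // all LDS reads/writes complete before any thread proceeds
    }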
@@ -1627,13 +1581,11 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
        const BElementwiseOperation b_element_op{};
        const CElementwiseOperation c_element_op{};
-        Block2CTileMap_streamk block_2_ctile_map_streamk(problem.M,
-                                                         problem.N,
-                                                         AK0Number * problem.KPadded,
-                                                         problem.Grid_size); // HS
-        uint32_t iter_start, iter_end; // HS
-        bool is_sk_block, is_dp_block; //, is_padding_block; //, is_reduction_block; // HS
-        index_t num_k_block_main_loop; // HS
+        Block2CTileMap_streamk block_2_ctile_map_streamk(
+            problem.M, problem.N, AK0Number * problem.KPadded, problem.Grid_size);
+        uint32_t iter_start, iter_end;
+        bool is_sk_block, is_dp_block; //, is_padding_block; //, is_reduction_block;
+        index_t num_k_block_main_loop;
        for(auto block_idx = get_block_1d_id();
            block_idx < block_2_ctile_map_streamk.get_grid_dims();
@@ -1644,21 +1596,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
            is_dp_block =
                static_cast<uint32_t>(block_idx) >= block_2_ctile_map_streamk.dp_start_block_idx &&
                static_cast<uint32_t>(block_idx) <
-                    block_2_ctile_map_streamk.reduction_start_block_idx; // HS
+                    block_2_ctile_map_streamk.reduction_start_block_idx;
-            block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end); // HS
-            num_k_block_main_loop = iter_end - iter_start; // HS
+            block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
+            num_k_block_main_loop = iter_end - iter_start;
            {
                uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
                    block_2_ctile_map_streamk.get_current_iter_length(
-                        iter_start, iter_end, num_k_block_main_loop)); // HS
+                        iter_start, iter_end, num_k_block_main_loop));
-                uint32_t tile_idx, iter_offset; // HS
+                uint32_t tile_idx, iter_offset;
                block_2_ctile_map_streamk.get_tile_idx_with_offset(
-                    iter_end - 1, tile_idx, iter_offset); // HS
+                    iter_end - 1, tile_idx, iter_offset);
-                iter_offset =
-                    __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1); // HS
+                iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
                const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M,
                                                                                 problem.MPadded,
@@ -1683,16 +1634,12 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                    p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
                const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_a_grid /*+ splitk_batch_offset.a_k_split_offset*/,
-                    a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+                    p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
                const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                    p_b_grid /*+ splitk_batch_offset.b_k_split_offset*/,
-                    b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+                    p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-                // const auto block_work_idx =
-                // block_2_ctile_map.CalculateBottomIndex(make_multi_index(block_idx));
                auto block_work_idx =
-                    block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N); // HS
+                    block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N);
                const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
                const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
@@ -1704,7 +1651,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                const index_t n_block_data_idx_on_grid =
                    __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
                const index_t k0_block_data_idx_on_grid =
-                    __builtin_amdgcn_readfirstlane(iter_offset * AK0Number); // HS
+                    __builtin_amdgcn_readfirstlane(iter_offset * AK0Number);
                // lds max alignment
                constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
@@ -1742,7 +1689,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                    true,
                    BlockwiseGemmPipe::GlobalBufferNum>(
                    a_grid_desc_ak0_m_ak1,
-                    make_multi_index(k0_block_data_idx_on_grid, m_block_data_idx_on_grid, 0), // HS
+                    make_multi_index(k0_block_data_idx_on_grid, m_block_data_idx_on_grid, 0),
                    a_element_op,
                    a_block_desc_ak0_m_ak1,
                    make_multi_index(0, 0, 0),
@@ -1773,7 +1720,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                    true,
                    BlockwiseGemmPipe::GlobalBufferNum>(
                    b_grid_desc_bk0_n_bk1,
-                    make_multi_index(k0_block_data_idx_on_grid, n_block_data_idx_on_grid, 0), // HS
+                    make_multi_index(k0_block_data_idx_on_grid, n_block_data_idx_on_grid, 0),
                    b_element_op,
                    b_block_desc_bk0_n_bk1,
                    make_multi_index(0, 0, 0),
...
@@ -21,70 +21,6 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
  device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
  device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
  device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-  device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
+  device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp)
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_default_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_default_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_default_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_default_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_default_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_default_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-  # device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-  # device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
-)
add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES})
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp"
...