Unverified commit f83e9701, authored by Haocong WANG, committed by GitHub

[GEMM] Gemm universal device operation (#1154)



* Optimize GEMM on MI200/300:
1. Add new blockwise gemm pipeline
2. Add irregular splitk instances

* clang format + typo fix

* Fix a bug

* initial commit

* Add more instances to irregular splitk

* blkgemm pipeline v1~4 prototype

* Sanity checked. Known issues:
1. Poor performance of splitk
2. Register spill on blkgemmpipeline v3

* Sanity and Performance fix:
1. fix a bug related to sanity in grouped b2c mapping
2. fix a bug related to sanity and performance in splitk offset

* Sanity and API update:
1. Remove prefetch stage
2. Fix valid check bug
3. Add the first gemm_universal instance into ckProfiler

* Add NN instances for gemm universal

* 1. Add NT instances for gemm_universal
2. Fix a bug about Kpadding in gemm_universal

* Fix a bug regarding padding for odd K numbers

* remove kernel print

* Fix KPadding bug...

* Update safety check

* Another try to fix KPadding

* Sanity checked

* new instances..

* clang format+typo fix

* remove clang format script's change

* Add non-hotloop compile option

* 1. Add fp16xfp8 example
2. Pull packed convert f8 from PR 1150

* Miscellaneous optimizations and fixes

* Add pipeline description docs

* Split universal gemm instance library to cut profiler compile time

* uncomment cmakefile

* Fix a bug caused by blockwise_gemm_pipe_v2

* reduce default splitk to 1

* Add 224x256x64 tile size

* update, including:
1. Experimental pipelines 5~7
2. Optimization for pipeline 4
3. Organized instance library

* temp save

* temp save

* Permuted LDS layout; sanity and function checked

* clang format

* Move OOB check from RunRead to RunWrite for better software pipelining.
TODO: AGPR spill with NN layout

* clangformat

* A/B splitpipe scheduler for v3

* Fix two bugs

* bug fix

* fix a bug in oob check

* Example for mixed fp16_fp8 gemm

* Clean experimental code blocks

* Add mixed precision gemm into profiler

* tempsave

* Optimize M/N-major LDS layout

* Add RRR GEMM mixed-precision instances

* Optimize f8 matrix transpose

* Add test_gemm_universal

* A/B split schedule for blkpip v5

* Take ds_read2 into iglp scheduling scheme

* format

* fixed cmake

* Add llvm-option into CI cmake flag

---------
Co-authored-by: Jing Zhang <jizhan@amd.com>
parent 7cdf5a96
@@ -259,46 +259,20 @@ struct BlockToCTileMap_M00_N0_M01Adapt : BlockToCTileMap_M00_N0_M01Adapt<MPerBlo
         BlockToCTileMap_M00_N0_M01Adapt;
 };
-// Rows of column-vectors
-// This C-tile map dynamically adjusts M01 when C-tile index is out of range
-template <index_t GroupNum, index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N = void>
-struct BlockToCTileMap_Grouped_M00_N0_M01Adapt;
+// Grouped Rows of column-vectors WGP mapping
+// Optimized for MI300-like multipe-die chip
 template <index_t GroupNum, index_t MPerBlock, index_t NPerBlock>
-struct BlockToCTileMap_Grouped_M00_N0_M01Adapt<GroupNum, MPerBlock, NPerBlock, void>
+struct BlockToCTileMap_Grouped_M00_N0_M01Adapt
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
-    __host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt() = default;
-    __host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt(
-        const BlockToCTileMap_Grouped_M00_N0_M01Adapt&) = default;
-    __host__ __device__
-    BlockToCTileMap_Grouped_M00_N0_M01Adapt(BlockToCTileMap_Grouped_M00_N0_M01Adapt&&) = default;
-    __host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt&
-    operator=(const BlockToCTileMap_Grouped_M00_N0_M01Adapt&) = default;
-    __host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt&
-    operator=(BlockToCTileMap_Grouped_M00_N0_M01Adapt&&) = default;
     __host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt(index_t M,
                                                                 index_t N,
                                                                 index_t M01 = 8)
         : M_(M), N_(N), M01_(M01)
     {
-#if 0
-        if(get_thread_global_1d_id()==0){
-            printf("Ctor called, M= %d, N= %d, M01 = %d\n", M_, N_, M01_);
-        }
-#endif
-    }
-    template <typename CGridDesc_M_N>
-    __host__ __device__
-    BlockToCTileMap_Grouped_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01 = 8)
-        : BlockToCTileMap_Grouped_M00_N0_M01Adapt(
-              c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1), M01)
-    {
     }
     __host__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
@@ -309,12 +283,6 @@ struct BlockToCTileMap_Grouped_M00_N0_M01Adapt<GroupNum, MPerBlock, NPerBlock, v
         return M0 * N0;
     }
-    template <typename CGridDesc_M_N>
-    __host__ static constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
-    {
-        return CalculateGridSize(c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1));
-    }
     template <typename CGridDesc_M_N>
     __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
     {
@@ -329,11 +297,25 @@ struct BlockToCTileMap_Grouped_M00_N0_M01Adapt<GroupNum, MPerBlock, NPerBlock, v
         const auto M0 = math::integer_divide_ceil(M_, MPerBlock);
         const auto N0 = math::integer_divide_ceil(N_, NPerBlock);
-        block_1d_id = block_1d_id % (M0 * N0); // swallow batch index
+        if(M0 == 1)
+        {
+            return make_tuple(0, block_1d_id);
+        }
+        else if(N0 == 1)
+        {
+            return make_tuple(block_1d_id, 0);
+        }
+        // block_1d_id = block_1d_id % (M0 * N0); // swallow batch index
+        else
+        {
         const auto group_size = math::integer_divide_ceil(M0 * N0, GroupNum);
-        auto group_id = block_1d_id % GroupNum;
-        auto remap_block_1d_id = group_id * group_size + block_1d_id / GroupNum;
+            const auto big_group_num = GroupNum - (group_size * GroupNum - M0 * N0);
+            auto group_id_x          = block_1d_id % GroupNum;
+            auto group_id_y          = block_1d_id / GroupNum;
+            auto remap_block_1d_id =
+                group_id_x <= big_group_num
+                    ? group_id_x * group_size + group_id_y
+                    : group_id_x * group_size + big_group_num - group_id_x + group_id_y;
         index_t idx_N0 = remap_block_1d_id % N0;
         index_t idx_M0 = remap_block_1d_id / N0;
@@ -391,6 +373,7 @@ struct BlockToCTileMap_Grouped_M00_N0_M01Adapt<GroupNum, MPerBlock, NPerBlock, v
         return make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
                           idx_N0_M01_local / M01_adapt);
     }
+    }
     template <typename CTileIdx, typename CTileDim>
     __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
@@ -405,15 +388,6 @@ struct BlockToCTileMap_Grouped_M00_N0_M01Adapt<GroupNum, MPerBlock, NPerBlock, v
     index_t M01_;
 };
-// keep the redundant type argument for backward compatibility
-template <index_t GroupNum, index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
-struct BlockToCTileMap_Grouped_M00_N0_M01Adapt
-    : BlockToCTileMap_Grouped_M00_N0_M01Adapt<GroupNum, MPerBlock, NPerBlock, void>
-{
-    using BlockToCTileMap_Grouped_M00_N0_M01Adapt<GroupNum, MPerBlock, NPerBlock, void>::
-        BlockToCTileMap_Grouped_M00_N0_M01Adapt;
-};
 // columns of row-vectors
 // This C-tile map dynamically adjusts N01 when C-tile index is out of range
 template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N = void>
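To make the grouped remapping above concrete, here is a minimal standalone sketch (host-side C++, not library code; GroupNum, M0 and N0 are illustrative values, with GroupNum standing in for something like the number of dies the mapping targets) that reproduces the block_1d_id -> (idx_M0, idx_N0) arithmetic of the general branch:

#include <cstdio>

// Standalone sketch of the grouped remapping in CalculateBottomIndex (general branch,
// i.e. M0 > 1 and N0 > 1). GroupNum, M0 and N0 are illustrative assumptions only.
int main()
{
    constexpr int GroupNum = 8;
    constexpr int M0 = 6, N0 = 5; // 30 C-tiles in total

    const int group_size    = (M0 * N0 + GroupNum - 1) / GroupNum;          // ceil(30 / 8) = 4
    const int big_group_num = GroupNum - (group_size * GroupNum - M0 * N0); // 8 - 2 = 6

    for(int block_1d_id = 0; block_1d_id < M0 * N0; ++block_1d_id)
    {
        const int group_id_x = block_1d_id % GroupNum; // which group (die) the block lands on
        const int group_id_y = block_1d_id / GroupNum; // position of the block within its group

        // The first big_group_num groups own group_size consecutive C-tiles each;
        // the remaining groups own group_size - 1.
        const int remap = group_id_x <= big_group_num
                              ? group_id_x * group_size + group_id_y
                              : group_id_x * group_size + big_group_num - group_id_x + group_id_y;

        printf("block %2d -> group %d -> C-tile (idx_M0 %d, idx_N0 %d)\n",
               block_1d_id, group_id_x, remap / N0, remap % N0);
    }
    return 0;
}

With these values the 30 tiles are distributed as six groups of four tiles and two groups of three, and every tile index is produced exactly once.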
@@ -163,6 +163,13 @@ struct scalar_type<bf8_t>
     static constexpr index_t vector_size = 1;
 };
+template <>
+struct scalar_type<bool>
+{
+    using type = bool;
+    static constexpr index_t vector_size = 1;
+};
 template <typename T>
 struct vector_type<T, 1>
 {
@@ -10,10 +10,12 @@ namespace ck {
 __device__ void block_sync_lds()
 {
 #if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
-    asm volatile("\
-    s_waitcnt lgkmcnt(0) \n \
-    s_barrier \
-    " ::);
+    // asm volatile("\
+    // s_waitcnt lgkmcnt(0) \n \
+    // s_barrier \
+    // " ::);
+    __builtin_amdgcn_s_waitcnt(0xc07f);
+    __builtin_amdgcn_s_barrier();
 #else
     __syncthreads();
 #endif
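One note on the builtin-based path above (this decoding is an assumption based on the gfx9-style s_waitcnt immediate layout, not something stated in the diff): the value 0xc07f leaves vmcnt and expcnt at their maxima and zeroes lgkmcnt, so the builtin pair reproduces the removed `s_waitcnt lgkmcnt(0)` / `s_barrier` inline assembly. A minimal HIP kernel sketch using the same pair:

#include <hip/hip_runtime.h>

// Hedged sketch (not library code). Assumed decoding of 0xc07f under the gfx9-style
// s_waitcnt immediate layout (vmcnt in bits [3:0] and [15:14], expcnt in [6:4],
// lgkmcnt in [11:8]): vmcnt = 63 (max, do not wait on vector memory), expcnt = 7
// (max, do not wait on exports), lgkmcnt = 0 (wait for all outstanding LDS traffic).
__global__ void lds_barrier_demo(float* out)
{
    __shared__ float lds_buf[64];
    lds_buf[threadIdx.x] = static_cast<float>(threadIdx.x);

    __builtin_amdgcn_s_waitcnt(0xc07f); // same effect as "s_waitcnt lgkmcnt(0)"
    __builtin_amdgcn_s_barrier();       // workgroup barrier, as in the removed asm

    // Safe to read a neighbour's LDS slot after the barrier.
    out[threadIdx.x] = lds_buf[(threadIdx.x + 1) % 64];
}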
@@ -43,6 +43,8 @@ __host__ __device__ constexpr Y bit_cast(const X& x)
 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST
     Y y;
+    // auto t = reinterpret_cast<const Y*>(&x);
+    // y = *t;
     __builtin_memcpy(&y, &x, sizeof(X));
     return y;
@@ -21,7 +21,7 @@ template <typename ADataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          typename ComputeTypeA = ADataType,
+          typename ComputeTypeA = CDataType,
           typename ComputeTypeB = ComputeTypeA>
 struct ReferenceGemm : public device::BaseOperator
 {
@@ -31,6 +31,18 @@ struct GeneratorTensor_1
     }
 };
+template <>
+struct GeneratorTensor_1<ck::half_t>
+{
+    float value = 1.0;
+
+    template <typename... Is>
+    ck::bhalf_t operator()(Is...)
+    {
+        return ck::type_convert<ck::half_t>(value);
+    }
+};
 template <>
 struct GeneratorTensor_1<ck::bhalf_t>
 {
@@ -43,6 +55,20 @@ struct GeneratorTensor_1<ck::bhalf_t>
     }
 };
+#if defined CK_ENABLE_FP8
+template <>
+struct GeneratorTensor_1<ck::f8_t>
+{
+    float value = 1.0;
+
+    template <typename... Is>
+    ck::bhalf_t operator()(Is...)
+    {
+        return ck::type_convert<ck::f8_t>(value);
+    }
+};
+#endif
 template <>
 struct GeneratorTensor_1<int8_t>
 {
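A brief usage sketch for the new generator specializations (hedged: the tensor shapes and include paths are illustrative assumptions; Tensor, HostTensorDescriptor and GenerateTensorValue are the existing CK host-tensor utilities):

#include <cstddef>
#include <vector>
// Assumed include paths, following the CK examples/tests:
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

// Hedged sketch: fill host tensors with a constant 1.0; the element type selects the
// matching GeneratorTensor_1 specialization added above. Shapes are made up.
void fill_constant_inputs()
{
    Tensor<ck::half_t> a_m_k(HostTensorDescriptor(std::vector<std::size_t>{128, 64}));
    a_m_k.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1.0f});

#if defined CK_ENABLE_FP8
    Tensor<ck::f8_t> b_k_n(HostTensorDescriptor(std::vector<std::size_t>{64, 128}));
    b_k_n.GenerateTensorValue(GeneratorTensor_1<ck::f8_t>{1.0f});
#endif
}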