Commit 38504cf4 authored by Po-Yen, Chen

Add readfirstlane() to copy content into SGPRs
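readfirstlane() wraps a wave-uniform object in a detail::sgpr_ptr<T> so that its bytes can eventually be held in SGPRs; call sites then reach the wrapped value through operator-> and operator*, as the hunks below show. A minimal usage sketch (DescLike and example() are placeholder names for illustration only; it assumes the ck utility header changed in this commit is included):

    // Sketch only: DescLike stands in for a real grid-descriptor type.
    struct DescLike
    {
        int size;
        __device__ int GetElementSpaceSize() const { return size; }
    };

    __device__ int example(const DescLike& d)
    {
        // Copy the descriptor towards SGPRs, then read it back through the pointer-like wrapper.
        const auto desc = ck::readfirstlane(d);
        return desc->GetElementSpaceSize() + (*desc).size;
    }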

parent aafba9b4
@@ -510,20 +510,20 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         __builtin_amdgcn_sched_barrier(0);
 #endif
-        const auto a_grid_desc_k0_m_k1 =
-            MakeAGridDescriptor_K0_M_K1(karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA);
-        const auto b_grid_desc_k0_n_k1 =
-            MakeBGridDescriptor_K0_N_K1(karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB);
-        const auto c_grid_desc_m_n =
-            MakeCGridDescriptor_M_N(karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC);
+        const auto a_grid_desc_k0_m_k1 = readfirstlane(
+            MakeAGridDescriptor_K0_M_K1(karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
+        const auto b_grid_desc_k0_n_k1 = readfirstlane(
+            MakeBGridDescriptor_K0_N_K1(karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
+        const auto c_grid_desc_m_n = readfirstlane(
+            MakeCGridDescriptor_M_N(karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
         const auto c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
-            MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);
+            MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(*c_grid_desc_m_n);
 
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
+            p_a_grid, a_grid_desc_k0_m_k1->GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
+            p_b_grid, b_grid_desc_k0_n_k1->GetElementSpaceSize());
         auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());
@@ -572,7 +572,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             ABlockTransferThreadClusterArrangeOrder,
             FloatAB,
             FloatABAdjusted,
-            decltype(a_grid_desc_k0_m_k1),
+            decltype(*a_grid_desc_k0_m_k1),
             decltype(a_block_desc_k0_m_k1),
             ABlockTransferSrcAccessOrder,
             Sequence<1, 0, 2>,
@@ -585,7 +585,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             AThreadTransferSrcResetCoordinateAfterRun,
             true,
             NumGemmKPrefetchStage>(
-            a_grid_desc_k0_m_k1,
+            *a_grid_desc_k0_m_k1,
             make_multi_index(0, m_block_data_idx_on_grid, 0),
             a_element_op,
             a_block_desc_k0_m_k1,
@@ -603,7 +603,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             BBlockTransferThreadClusterArrangeOrder,
             FloatAB,
             FloatABAdjusted,
-            decltype(b_grid_desc_k0_n_k1),
+            decltype(*b_grid_desc_k0_n_k1),
             decltype(b_block_desc_k0_n_k1),
             BBlockTransferSrcAccessOrder,
             Sequence<1, 0, 2>,
@@ -616,7 +616,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
             BThreadTransferSrcResetCoordinateAfterRun,
             true,
             NumGemmKPrefetchStage>(
-            b_grid_desc_k0_n_k1,
+            *b_grid_desc_k0_n_k1,
             make_multi_index(0, n_block_data_idx_on_grid, 0),
             b_element_op,
             b_block_desc_k0_n_k1,
@@ -665,13 +665,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 #if ENABLE_DUMP_CLOCK
         long loop_start = 0, loop_end = 0;
 #endif
-        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
+        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(*a_grid_desc_k0_m_k1,
             a_block_desc_k0_m_k1,
             a_blockwise_copy,
             a_grid_buf,
             a_block_buf,
             a_block_slice_copy_step,
-            b_grid_desc_k0_n_k1,
+            *b_grid_desc_k0_n_k1,
             b_block_desc_k0_n_k1,
             b_blockwise_copy,
             b_grid_buf,
...
@@ -57,4 +57,36 @@ __host__ __device__ constexpr Y bit_cast(const X& x)
 #endif
 }
 
+namespace detail {
+
+template <typename T>
+struct sgpr_ptr
+{
+    static_assert(!std::is_const_v<T> && !std::is_reference_v<T> &&
+                  std::is_trivially_copyable_v<T>);
+
+    __device__ explicit sgpr_ptr(const T& obj) noexcept
+    {
+        /// TODO: copy object content into member 'memory' by __builtin_amdgcn_readfirstlane()
+        __builtin_memcpy(memory, &obj, sizeof(obj));
+    }
+
+    __device__ T& operator*() { return *(this->operator->()); }
+    __device__ const T& operator*() const { return *(this->operator->()); }
+
+    __device__ T* operator->() { return reinterpret_cast<T*>(memory); }
+    __device__ const T* operator->() const { return reinterpret_cast<const T*>(memory); }
+
+    private:
+    alignas(T) unsigned char memory[sizeof(T) + 3];
+};
+
+} // namespace detail
+
+template <typename T>
+__device__ constexpr auto readfirstlane(const T& obj)
+{
+    return detail::sgpr_ptr<T>(obj);
+}
+
 } // namespace ck
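Note on the TODO in sgpr_ptr's constructor: a minimal sketch (an assumption, not part of this commit) of how the copy could be routed through __builtin_amdgcn_readfirstlane(), which operates on 32-bit values and therefore motivates the sizeof(T) + 3 padding of 'memory':

    #include <type_traits>

    #include <hip/hip_runtime.h>

    // Sketch only: copy_via_readfirstlane() is a hypothetical helper, not from this commit.
    // It stages the object as dwords, broadcasts each dword from lane 0 with
    // __builtin_amdgcn_readfirstlane(), and writes the wave-uniform result to dst.
    template <typename T>
    __device__ void copy_via_readfirstlane(unsigned char* dst, const T& obj)
    {
        static_assert(std::is_trivially_copyable_v<T>);

        constexpr unsigned num_dwords = (sizeof(T) + 3) / 4; // round up to whole dwords
        unsigned dwords[num_dwords] = {};

        __builtin_memcpy(dwords, &obj, sizeof(obj)); // stage the bytes as 32-bit chunks

    #pragma unroll
        for(unsigned idx = 0; idx < num_dwords; ++idx)
        {
            // each readfirstlane() result is uniform across the wavefront,
            // so the compiler can keep it in an SGPR
            dwords[idx] = static_cast<unsigned>(
                __builtin_amdgcn_readfirstlane(static_cast<int>(dwords[idx])));
        }

        __builtin_memcpy(dst, dwords, sizeof(dwords)); // dst must hold sizeof(T) + 3 bytes
    }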