/***************************************************************************************************
 * Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*!
  \file
  \brief Unit testbed for kernel-level GEMM
*/

#pragma once

#include "hip/hip_runtime.h"

#include <fstream>

#include "../../common/hytlass_unit_test.h"

#include "hytlass/aligned_buffer.h"
#include "hytlass/gemm/gemm.h"
#include "hytlass/layout/matrix.h"
#include "hytlass/layout/vector.h"
#include "hytlass/numeric_types.h"
#include "hytlass/core_io.h"

#include "hytlass/util/host_tensor.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/distribution.h"
#include "hytlass/util/reference/host/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_fill.h"

#include "hytlass/gemm/threadblock/default_mma_core_simt.h"
#include "hytlass/gemm/threadblock/default_mma_core_gfx928.h"
#include "hytlass/transform/threadblock/predicated_tile_iterator.h"
#include "hytlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"

#include "hytlass/hytlass.h"
#include "hytlass/platform/platform.h"

namespace test {
namespace gemm {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Mma>
__launch_bounds__(Mma::WarpCount::kCount * WARP_SIZE_GPU)
__global__ void kernel_mma(hytlass::gemm::GemmCoord problem_size,
                           typename Mma::IteratorA::Params params_A,
                           typename Mma::IteratorA::TensorRef ref_A,
                           typename Mma::IteratorB::Params params_B,
                           typename Mma::IteratorB::TensorRef ref_B,
                           typename Mma::ElementC *ptr_C,
                           typename Mma::LayoutC::Stride::Index ldc) {
  // Shared storage needed by threadblock-scoped matrix multiply-accumulate
  __shared__ typename Mma::SharedStorage shared_storage;

  // Compute threadblock location
  hytlass::gemm::GemmCoord tb_tile_offset = {int(blockIdx.x), int(blockIdx.y), 0};

  hytlass::MatrixCoord tb_offset_A{tb_tile_offset.m() * Mma::Shape::kM,
                                   tb_tile_offset.k()};

  hytlass::MatrixCoord tb_offset_B{tb_tile_offset.k(),
                                   tb_tile_offset.n() * Mma::Shape::kN};

  // Compute position within threadblock
  int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;

  // Construct iterators to A and B operands
  typename Mma::IteratorA iterator_A(params_A, ref_A.data(),
                                     {problem_size.m(), problem_size.k()},
                                     tb_thread_id, tb_offset_A);

  typename Mma::IteratorB iterator_B(params_B, ref_B.data(),
                                     {problem_size.k(), problem_size.n()},
                                     tb_thread_id, tb_offset_B);

  int warp_id = threadIdx.y;
  int lane_id = threadIdx.x;

  // Construct thread-scoped matrix multiply
  Mma mma(shared_storage, tb_thread_id, warp_id, threadIdx.x);

  typename Mma::FragmentC accum;

  accum.clear();

  int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;

  // Compute threadblock-scoped matrix multiply-add
  mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);

  // Output results
  typename Mma::Operator::IteratorC iterator_C({ptr_C, ldc}, lane_id);

  iterator_C.add_tile_offset(
      {(tb_tile_offset.m() * Mma::WarpCount::kM) + (warp_id % Mma::WarpCount::kM),
       (tb_tile_offset.n() * Mma::WarpCount::kN) + (warp_id / Mma::WarpCount::kM)});

  iterator_C.store(accum);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Structure to compute the matrix product
template <
    /// Threadblock-level matrix multiply-accumulate
    typename MmaCore_,
    /// Number of stages
    int Stages = 2>
struct Testbed {
  /// Threadblock-level GEMM implementation
  using MmaCore = MmaCore_;
  using ThreadblockShape = typename MmaCore::Shape;
  using WarpShape = typename MmaCore::WarpShape;
  using InstructionShape = typename MmaCore::InstructionShape;
  using ElementA = typename MmaCore::ElementA;
  using LayoutA = typename MmaCore::LayoutA;
  using ElementB = typename MmaCore::ElementB;
  using LayoutB = typename MmaCore::LayoutB;
  using ElementC = typename MmaCore::ElementC;
  using LayoutC = typename MmaCore::LayoutC;
  static const int kStages = Stages;

  // Define iterators over tiles from the A operand
  //
  // The 2d-thread-tile iterators are selected for the SIMT dot-product paths. The element and
  // instruction-shape checks below are a reconstruction (assumed int8 dp4a and f16 dot2 SIMT
  // paths); treat them as assumptions and adjust to the MmaCore specializations in use.
  static const bool use_idp4a =
      hytlass::platform::is_same<ElementA, int8_t>::value &&
      hytlass::platform::is_same<ElementB, int8_t>::value &&
      hytlass::platform::is_same<typename MmaCore::OperatorClass, hytlass::arch::OpClassSimt>::value &&
      hytlass::platform::is_same<InstructionShape, hytlass::gemm::GemmShape<1, 1, 4> >::value;

  static const bool use_dot2_f32_f16 =
      hytlass::platform::is_same<ElementA, hytlass::half_t>::value &&
      hytlass::platform::is_same<ElementB, hytlass::half_t>::value &&
      hytlass::platform::is_same<typename MmaCore::OperatorClass, hytlass::arch::OpClassSimt>::value &&
      hytlass::platform::is_same<InstructionShape, hytlass::gemm::GemmShape<1, 1, 2> >::value;

  static const bool transposeA = hytlass::platform::is_same< LayoutA, hytlass::layout::ColumnMajor >::value;
  static const bool transposeB = hytlass::platform::is_same< LayoutB, hytlass::layout::RowMajor >::value;

  using IteratorA = typename hytlass::platform::conditional< use_idp4a || use_dot2_f32_f16,
      hytlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
          hytlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA>,
      hytlass::transform::threadblock::PredicatedTileIterator<
          hytlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA>
      >::type;

  // Define iterators over tiles from the B operand
  using IteratorB = typename hytlass::platform::conditional< use_idp4a || use_dot2_f32_f16,
      hytlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
          hytlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB>,
      hytlass::transform::threadblock::PredicatedTileIterator<
          hytlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB>
      >::type;

  // Define MmaPipeline Single Stage
  using MmaPipelineSingleStage = hytlass::gemm::threadblock::MmaSingleStage<
      typename MmaCore::Shape,
      IteratorA, typename MmaCore::SmemIteratorA,
      IteratorB, typename MmaCore::SmemIteratorB,
      ElementC, LayoutC, typename MmaCore::MmaPolicy>;

  // Define MmaPipeline Two Stages
  using MmaPipelineTwoStages = hytlass::gemm::threadblock::MmaPipelined<
      typename MmaCore::Shape,
      IteratorA, typename MmaCore::SmemIteratorA,
      IteratorB, typename MmaCore::SmemIteratorB,
      ElementC, LayoutC, typename MmaCore::MmaPolicy>;

  // Define the threadblock-scoped pipelined matrix multiply (select between single and two stages)
  using Mma = typename hytlass::platform::conditional<(kStages == 1),
      MmaPipelineSingleStage, MmaPipelineTwoStages>::type;

  //
  // Data members
  //

  hytlass::HostTensor<ElementA, LayoutA> matrix_A;
  hytlass::HostTensor<ElementB, LayoutB> matrix_B;
  hytlass::HostTensor<ElementC, LayoutC> matrix_C_computed;
  hytlass::HostTensor<ElementC, LayoutC> matrix_C_reference;

  hytlass::gemm::GemmCoord problem_size;
  float alpha, beta;

  //
  // Methods
  //

  /// Allocates workspace in device memory
  Testbed(int m, int n, int k, float alpha_, float beta_)
      : problem_size(m, n, k), alpha(alpha_), beta(beta_) {
    matrix_A.reset(hytlass::make_Coord(m, k));
    matrix_B.reset(hytlass::make_Coord(k, n));
    matrix_C_computed.reset(hytlass::make_Coord(m, n));
    matrix_C_reference.reset(hytlass::make_Coord(m, n), false);
  }

  bool sufficient() {
    return true;
  }

  /// Runs the test
  bool run(
      dim3 grid, dim3 block,
      hytlass::Distribution::Kind init_A = hytlass::Distribution::Uniform,
      hytlass::Distribution::Kind init_B = hytlass::Distribution::Uniform) {

    // Waive test if insufficient HIP device
    if (!sufficient()) {
      if (HYTLASS_TEST_UNIT_ENABLE_WARNINGS) {
        std::cerr << "Test waived due to insufficient HIP device."
                  << std::endl;
      }

      return true;
    }

    //
    // initialize device memory
    //

    if (init_A == hytlass::Distribution::Uniform) {

      int scope_max = 8;
      int scope_min = -8;

      if (hytlass::sizeof_bits<ElementA>::value == 4) {
        scope_max = 2;
        scope_min = -2;
      } else if (hytlass::sizeof_bits<ElementA>::value == 1) {
        scope_max = 2;
        scope_min = 0;
      }

      uint64_t seed = 7;
      hytlass::reference::host::TensorFillRandomUniform(
          matrix_A.host_view(), seed, scope_max, scope_min, 0);
    } else if (init_A == hytlass::Distribution::Sequential) {
      hytlass::reference::host::BlockFillSequential(matrix_A.host_data(),
                                                    matrix_A.capacity());
    } else if (init_A == hytlass::Distribution::Identity) {
      hytlass::reference::host::TensorFillIdentity(matrix_A.host_view());
    } else {
      return false;
    }

    if (init_B == hytlass::Distribution::Uniform) {

      int scope_max = 8;
      int scope_min = -8;

      if (hytlass::sizeof_bits<ElementB>::value == 4) {
        scope_max = 2;
        scope_min = -2;
      } else if (hytlass::sizeof_bits<ElementB>::value == 1) {
        scope_max = 2;
        scope_min = 0;
      }

      uint64_t seed = 7;
      hytlass::reference::host::TensorFillRandomUniform(
          matrix_B.host_view(), seed + 16, scope_max, scope_min, 0);
    } else if (init_B == hytlass::Distribution::Sequential) {
      hytlass::reference::host::BlockFillSequential(matrix_B.host_data(),
                                                    matrix_B.capacity());
    } else if (init_B == hytlass::Distribution::Identity) {
      hytlass::reference::host::TensorFillIdentity(matrix_B.host_view());
    } else {
      return false;
    }

    hytlass::reference::host::TensorFill(matrix_C_computed.host_view());
    hytlass::reference::host::TensorFill(matrix_C_reference.host_view());

    matrix_A.sync_device();
    matrix_B.sync_device();
    matrix_C_computed.sync_device();

    typename IteratorA::Params params_A(matrix_A.layout());
    typename IteratorB::Params params_B(matrix_B.layout());

    test::gemm::threadblock::kernel_mma<Mma><<<grid, block>>>(
        problem_size, params_A, matrix_A.device_ref(), params_B,
        matrix_B.device_ref(), matrix_C_computed.device_data(),
        matrix_C_computed.layout().stride(0));

    //
    // Check error code
    //

    hipError_t result = hipDeviceSynchronize();
    EXPECT_EQ(result, hipSuccess)
        << " kernel error: " << hipGetErrorString(result)
        << " on device " << GetHipDevice();

    matrix_C_computed.sync_host();

    hytlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
                                   ElementC, LayoutC, ElementC, ElementC>
        reference_gemm;

    reference_gemm(
        problem_size, ElementC(alpha), matrix_A.host_view(),
        matrix_B.host_view(), ElementC(beta), matrix_C_reference.host_view());

    bool passed = hytlass::reference::host::TensorEquals(
        matrix_C_computed.host_view(), matrix_C_reference.host_view());

    EXPECT_TRUE(passed) << "Failed on device " << GetHipDevice();

    if (!passed) {
      std::ofstream output("mma_pipelined_testbed_errors.txt");

      output << "A:\n" << matrix_A.host_view() << "\n"
             << "B:\n" << matrix_B.host_view() << "\n"
             << "Reference:\n" << matrix_C_reference.host_view() << "\n"
             << "Computed:\n" << matrix_C_computed.host_view() << "\n";
    }

    return passed;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace threadblock
}  // namespace gemm
}  // namespace test
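
/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Example usage (a minimal sketch, not part of the testbed itself): instantiate a
// threadblock-level MmaCore, then drive Testbed::run() with a grid/block shape matching the
// warp count. The DefaultMmaCore parameter order, tile shapes, and element types below are
// assumptions modeled on the analogous SIMT threadblock tests; substitute an MmaCore
// specialization that actually exists in this build.
//
//   using ThreadblockShape = hytlass::gemm::GemmShape<64, 64, 8>;
//   using WarpShape        = hytlass::gemm::GemmShape<32, 64, 8>;
//   using InstructionShape = hytlass::gemm::GemmShape<1, 1, 1>;
//
//   using MmaCore = typename hytlass::gemm::threadblock::DefaultMmaCore<
//       ThreadblockShape, WarpShape, InstructionShape,
//       float, hytlass::layout::ColumnMajor,   // A
//       float, hytlass::layout::RowMajor,      // B
//       float, hytlass::layout::RowMajor,      // C
//       hytlass::arch::OpClassSimt>;
//
//   hytlass::gemm::GemmCoord problem_size(64, 64, 48);
//
//   dim3 grid(1, 1);                                           // one threadblock tile
//   dim3 block(WARP_SIZE_GPU, MmaCore::WarpCount::kCount, 1);  // x: lane id, y: warp id
//
//   test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
//                                             problem_size.k(), /*alpha=*/1.0f, /*beta=*/0.0f)
//       .run(grid, block);
//
/////////////////////////////////////////////////////////////////////////////////////////////////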