/***************************************************************************************************
 * Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#include "hip/hip_runtime.h"

#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <cmath>
#include <iostream>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include "hute/layout.hpp"
#include <hute/tensor.hpp>

#include "hytlass/util/print_error.hpp"
#include "hytlass/util/GPU_Clock.hpp"
#include "hytlass/util/helper_hip.hpp"

#if defined(HYTLASS_ENABLE_HIPBLAS) && HYTLASS_ENABLE_HIPBLAS != 0
#  include "hytlass/util/hipblas_wrappers.hpp"
#endif

// Naive host reference GEMM: C(i,j) = sum_k A(i,k) * B(j,k)
template <typename T>
void cpu_gemm(T *c, const T &a, const T &b)
{
  using namespace hute;
  using ValueType = typename T::value_type;

  int m = size<0>(a);
  int n = size<0>(b);
  int k = size<1>(a);

  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float s = 0.f;
      for (int kk = 0; kk < k; ++kk) {
        float v1 = a(i, kk);
        float v2 = b(j, kk);
        s += v1 * v2;
      }
      (*c)(i, j) = ValueType(s);
    }
  }
}

// Element-wise comparison of two host tensors against an absolute threshold
template <typename T>
void cpu_compare(const T &x, const T &y, float threshold)
{
  using namespace hute;
  if (size(x) != size(y)) {
    fprintf(stderr, "length not equal x = %d, y = %d\n", size(x), size(y));
    exit(9);
  }

  int n = size(x);
  float diff_max = 0;
  int diff_count = 0;
  for (int i = 0; i < n; ++i) {
    float v0 = x(i);
    float v1 = y(i);
    // printf("[%d]: %f %f\n", i, v0, v1);
    diff_max = max(diff_max, fabs(v0 - v1));
    if (fabs(v0 - v1) > threshold) {
      printf("[%d]: %f %f\n", i, v0, v1);
      ++diff_count;
    }
  }
  if (diff_count > 0) {
    printf("check fail: max_diff = %f, diff_count = %d\n", diff_max, diff_count);
  } else {
    printf("cpu check ok\n");
  }
}

template <class ProblemShape, class CtaTiler,
          class TA, class AStride, class ASmemLayout, class TiledCopyA,
          class TB, class BStride, class BSmemLayout, class TiledCopyB,
          class TC, class CStride, class CSmemLayout, class TiledMma,
          class Alpha, class Beta>
__global__ static
__launch_bounds__(decltype(size(TiledMma{}))::value)
void
gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler,
            TA const* A, AStride dA, ASmemLayout sA_layout, TiledCopyA copy_a,
            TB const* B, BStride dB, BSmemLayout sB_layout, TiledCopyB copy_b,
            TC      * C, CStride dC, CSmemLayout          , TiledMma mma,
            Alpha alpha, Beta beta)
{
  using namespace hute;
  using X = Underscore;
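  // Data convention used throughout this kernel: A is viewed as (M,K) and B as (N,K),
  // so a single reduction over the shared K mode computes C(m,n) = sum_k A(m,k) * B(n,k),
  // matching the cpu_gemm reference above. The NT and TN entry points below only differ
  // in the strides they pass in for these views.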
  // Preconditions
  HUTE_STATIC_ASSERT_V(rank(shape_MNK) == Int<3>{});                   // (M, N, K)
  HUTE_STATIC_ASSERT_V(rank(cta_tiler) == Int<3>{});                   // (BLK_M, BLK_N, BLK_K)

  HUTE_STATIC_ASSERT_V(size(copy_a) == size(mma));                     // NumThreads
  HUTE_STATIC_ASSERT_V(size(copy_b) == size(mma));                     // NumThreads

  static_assert(is_static<ASmemLayout>::value);
  static_assert(is_static<BSmemLayout>::value);
  static_assert(is_static<CSmemLayout>::value);

  HUTE_STATIC_ASSERT_V(size<0>(ASmemLayout{}) == size<0>(cta_tiler));  // BLK_M
  HUTE_STATIC_ASSERT_V(size<0>(CSmemLayout{}) == size<0>(cta_tiler));  // BLK_M
  HUTE_STATIC_ASSERT_V(size<0>(BSmemLayout{}) == size<1>(cta_tiler));  // BLK_N
  HUTE_STATIC_ASSERT_V(size<1>(CSmemLayout{}) == size<1>(cta_tiler));  // BLK_N
  HUTE_STATIC_ASSERT_V(size<1>(ASmemLayout{}) == size<2>(cta_tiler));  // BLK_K
  HUTE_STATIC_ASSERT_V(size<1>(BSmemLayout{}) == size<2>(cta_tiler));  // BLK_K

  HUTE_STATIC_ASSERT_V(congruent(select<0,2>(shape_MNK), dA));         // dA strides for shape MK
  HUTE_STATIC_ASSERT_V(congruent(select<1,2>(shape_MNK), dB));         // dB strides for shape NK
  HUTE_STATIC_ASSERT_V(congruent(select<0,1>(shape_MNK), dC));         // dC strides for shape MN

  //
  // Full and Tiled Tensors
  //

  // Represent the full tensors
  Tensor mA = make_tensor(make_gmem_ptr(A), select<0,2>(shape_MNK), dA); // (M,K)
  Tensor mB = make_tensor(make_gmem_ptr(B), select<1,2>(shape_MNK), dB); // (N,K)
  Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N)

  // Get the appropriate blocks for this thread block
  auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _);              // (m,n,k)
  Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{});  // (BLK_M,BLK_K,k)
  Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{});  // (BLK_N,BLK_K,k)
  Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{});  // (BLK_M,BLK_N)

  // Shared memory buffers
  __shared__ TA smemA[cosize_v<ASmemLayout>];
  __shared__ TB smemB[cosize_v<BSmemLayout>];
  Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout);            // (BLK_M,BLK_K)
  Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout);            // (BLK_N,BLK_K)

  //
  // Partition the copying of A and B tiles across the threads
  //

  // TUTORIAL: Example of partitioning via a TiledCopy

  ThrCopy thr_copy_a = copy_a.get_slice(threadIdx.x);
  Tensor tAgA = thr_copy_a.partition_S(gA);                            // (CPY,CPY_M,CPY_K,k)
  Tensor tAsA = thr_copy_a.partition_D(sA);                            // (CPY,CPY_M,CPY_K)
  Tensor tArA = make_fragment_like(tAsA);                              // (CPY,CPY_M,CPY_K)

  ThrCopy thr_copy_b = copy_b.get_slice(threadIdx.x);
  Tensor tBgB = thr_copy_b.partition_S(gB);                            // (CPY,CPY_N,CPY_K,k)
  Tensor tBsB = thr_copy_b.partition_D(sB);                            // (CPY,CPY_N,CPY_K)
  Tensor tBrB = make_fragment_like(tBsB);                              // (CPY,CPY_N,CPY_K)

  HUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tAsA));                // CPY_M
  HUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tArA));                // CPY_M
  HUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tAsA));                // CPY_K
  HUTE_STATIC_ASSERT_V(size<2>(tAgA) == size<2>(tArA));                // CPY_K
  HUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBsB));                // CPY_N
  HUTE_STATIC_ASSERT_V(size<1>(tBgB) == size<1>(tBrB));                // CPY_N
  HUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBsB));                // CPY_K
  HUTE_STATIC_ASSERT_V(size<2>(tBgB) == size<2>(tBrB));                // CPY_K

  // Copy gmem to rmem for k_tile=0
  copy(copy_a, tAgA(_,_,_,0), tArA);
  copy(copy_b, tBgB(_,_,_,0), tBrB);

  //
  // Define A/B partitioning and C accumulators
  //

  // TUTORIAL: Example of partitioning via a TiledMMA

  ThrMMA thr_mma = mma.get_slice(threadIdx.x);
  Tensor tCsA = thr_mma.partition_A(sA);                               // (MMA,MMA_M,MMA_K)
  Tensor tCsB = thr_mma.partition_B(sB);                               // (MMA,MMA_N,MMA_K)
  Tensor tCgC = thr_mma.partition_C(gC);                               // (MMA,MMA_M,MMA_N)
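  // Each thread of the TiledMMA owns a slice of the shared-memory A/B tiles and of the
  // output tile gC; the MMA_M/MMA_N/MMA_K modes count how many atom-sized pieces that
  // thread steps through to cover the whole CTA tile.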
  // Allocate registers for pipelining
  Tensor tCrA = thr_mma.make_fragment_A(tCsA);                         // (MMA,MMA_M,MMA_K)
  Tensor tCrB = thr_mma.make_fragment_B(tCsB);                         // (MMA,MMA_N,MMA_K)
  // Allocate the accumulators -- same size as the projected data
  Tensor tCrC = thr_mma.make_fragment_C(tCgC);                         // (MMA,MMA_M,MMA_N)

  HUTE_STATIC_ASSERT_V(  shape(tCrA) == shape(tCsA));                  // (MMA,MMA_M,MMA_K)
  HUTE_STATIC_ASSERT_V(  shape(tCrB) == shape(tCsB));                  // (MMA,MMA_N,MMA_K)
  HUTE_STATIC_ASSERT_V(  shape(tCrC) == shape(tCgC));                  // (MMA,MMA_M,MMA_N)
  HUTE_STATIC_ASSERT_V(size<1>(tCgC) == size<1>(tCsA));                // MMA_M
  HUTE_STATIC_ASSERT_V(size<2>(tCgC) == size<1>(tCsB));                // MMA_N
  HUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                // MMA_K

  // Clear the accumulators
  clear(tCrC);

#if 0
  if(thread0()) {
    print("  mA : "); print(  mA); print("\n");
    print("  gA : "); print(  gA); print("\n");
    print("  sA : "); print(  sA); print("\n");
    print("tAgA : "); print(tAgA); print("\n");
    print("tAsA : "); print(tAsA); print("\n");
    print("tArA : "); print(tArA); print("\n");
  }
#endif

#if 0
  if(thread0()) {
    print("  mB : "); print(  mB); print("\n");
    print("  gB : "); print(  gB); print("\n");
    print("  sB : "); print(  sB); print("\n");
    print("tBgB : "); print(tBgB); print("\n");
    print("tBsB : "); print(tBsB); print("\n");
    print("tBrB : "); print(tBrB); print("\n");
  }
#endif

#if 0
  if(thread0()) {
    print("  mC : "); print(  mC); print("\n");
    print("  gC : "); print(  gC); print("\n");
    print("tCsA : "); print(tCsA); print("\n");
    print("tCsB : "); print(tCsB); print("\n");
    print("tCgC : "); print(tCgC); print("\n");
    print("tCrC : "); print(tCrC); print("\n");
  }
#endif

#if 1

  // Copy rmem to smem
  copy(tArA, tAsA);
  copy(tBrB, tBsB);
  __syncthreads();

  //
  // PIPELINED MAIN LOOP
  // TUTORIAL: Example of a gemm loop that pipelines shared memory AND register memory
  //   Data is read from global to registers, then to shared via the tA|tB partitions
  //   Data is then copied from shared to registers in multiple waves via the tC partitions
  //     and gemm(.) operates on the current register wave
  //

  // Load A, B shmem->regs for k_block=0
  copy(tCsA(_,_,0), tCrA(_,_,0));
  copy(tCsB(_,_,0), tCrB(_,_,0));

  auto K_TILE_MAX  = size<3>(tAgA);
  auto K_BLOCK_MAX = size<2>(tCrA);

  HUTE_NO_UNROLL
  for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile)
  {
    // Pipeline the k-mode of the block registers
    HUTE_UNROLL
    for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block)
    {
      if (k_block == K_BLOCK_MAX - 1)
      {
        // Copy rmem to smem
        __syncthreads();
        copy(tArA, tAsA);
        copy(tBrB, tBsB);
        __syncthreads();
      }

      // Copy smem to rmem for k_block+1
      int k_block_next = (k_block + 1) % K_BLOCK_MAX;
      copy(tCsA(_,_,k_block_next), tCrA(_,_,k_block_next));
      copy(tCsB(_,_,k_block_next), tCrB(_,_,k_block_next));

      if (k_block == 0)
      {
        // Copy gmem to rmem for k_tile+1
        int k_tile_next = (k_tile + 1 < K_TILE_MAX) ? k_tile + 1 : k_tile;
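        // k_tile_next is clamped on the last tile so that the prefetch below never
        // reads out of bounds; the redundantly re-read data is simply never consumed.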
        copy(copy_a, tAgA(_,_,_,k_tile_next), tArA);
        copy(copy_b, tBgB(_,_,_,k_tile_next), tBrB);
      }

      // Thread-level register gemm for k_block
      gemm(mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC);

    } // k_block

  } // k_tile

#endif

  //
  // Epilogue
  //

  axpby(alpha, tCrC, beta, tCgC);
}

// Setup params for a NT GEMM
template <class TA, class TB, class TC,
          class Alpha, class Beta>
void
gemm_nt(int m, int n, int k,
        Alpha alpha,
        TA const* A, int ldA,
        TB const* B, int ldB,
        Beta beta,
        TC      * C, int ldC,
        hipStream_t stream = 0)
{
  using namespace hute;

  // Define shapes (dynamic)
  auto M = int(m);
  auto N = int(n);
  auto K = int(k);
  auto prob_shape = make_shape(M, N, K);                     // (M, N, K)

  // Define NT strides (mixed)
  auto dA = make_stride(Int<1>{}, ldA);                      // (dM, dK)
  auto dB = make_stride(Int<1>{}, ldB);                      // (dN, dK)
  auto dC = make_stride(Int<1>{}, ldC);                      // (dM, dN)

  // Define CTA tile sizes (static)
  auto bM = Int<128>{};
  auto bN = Int<128>{};
  auto bK = Int<  8>{};
  auto cta_tiler = make_shape(bM, bN, bK);                   // (BLK_M, BLK_N, BLK_K)

  // Define the smem layouts (static)
  auto sA = make_layout(make_shape(bM, bK));                 // (m,k) -> smem_idx; m-major
  auto sB = make_layout(make_shape(bN, bK));                 // (n,k) -> smem_idx; n-major
  auto sC = make_layout(make_shape(bM, bN));                 // (m,n) -> smem_idx; m-major

  // Define the thread layouts (static)

  TiledCopy copyA = make_tiled_copy(Copy_Atom<UniversalCopy<uint128_t>, TA>{}, // 128-bit (4 x float) loads
                                    Layout<Shape<_32,_8>>{},                   // Thr layout 32x8 m-major
                                    Layout<Shape< _4,_1>>{});                  // Val layout  4x1 m-major
  TiledCopy copyB = make_tiled_copy(Copy_Atom<UniversalCopy<uint128_t>, TB>{}, // 128-bit (4 x float) loads
                                    Layout<Shape<_32,_8>>{},                   // Thr layout 32x8 n-major
                                    Layout<Shape< _4,_1>>{});                  // Val layout  4x1 n-major

  TiledMMA mmaC = make_tiled_mma(GFX928_16x16x8_F32F32F32F32_NT{},
                                 Layout<Shape<_2,_2,_1>>{},  // Layout in Thr: 2x2 grid of atoms (assumes a 64-lane atom -> 256 threads, matching the 32x8 copy layout)
                                 Layout<Shape<_1,_1,_1>>{}   // Layout in Val
                                );

#if 0
  print(copyA);
  print(copyB);
  print(mmaC);
#endif

#if 0
  print_latex(copyA);
  print_latex(copyB);
  print_latex(mmaC);
#endif

  dim3 dimBlock(size(mmaC));
  dim3 dimGrid(size(ceil_div(M, bM)),
               size(ceil_div(N, bN)));
  gemm_device<<<dimGrid, dimBlock, 0, stream>>>
      (prob_shape, cta_tiler,
       A, dA, sA, copyA,
       B, dB, sB, copyB,
       C, dC, sC, mmaC,
       alpha, beta);
}

// Setup params for a TN GEMM
template <class TA, class TB, class TC,
          class Alpha, class Beta>
void
gemm_tn(int m, int n, int k,
        Alpha alpha,
        TA const* A, int ldA,
        TB const* B, int ldB,
        Beta beta,
        TC      * C, int ldC,
        hipStream_t stream = 0)
{
  using namespace hute;

  // Define shapes (dynamic)
  auto M = int(m);
  auto N = int(n);
  auto K = int(k);
  auto prob_shape = make_shape(M, N, K);                     // (M, N, K)

  // Define TN strides (mixed)
  auto dA = make_stride(ldA, Int<1>{});                      // (dM, dK)
  auto dB = make_stride(ldB, Int<1>{});                      // (dN, dK)
  auto dC = make_stride(Int<1>{}, ldC);                      // (dM, dN)

  // Define CTA tile sizes (static)
  auto bM = Int<128>{};
  auto bN = Int<128>{};
  auto bK = Int<  8>{};
  auto cta_tiler = make_shape(bM, bN, bK);                   // (BLK_M, BLK_N, BLK_K)

  // Define the smem layouts (static)
  auto sA = make_layout(make_shape ( bM, bK),
                        make_stride(Int<1>{}, bM+Int<1>{})); // (m,k) -> smem_idx; padded m-major
  auto sB = make_layout(make_shape ( bN, bK),
                        make_stride(Int<1>{}, bN+Int<1>{})); // (n,k) -> smem_idx; padded n-major
  auto sC = make_layout(make_shape(bM, bN));                 // (m,n) -> smem_idx

  // Define the thread layouts (static)

  TiledCopy copyA = make_tiled_copy(Copy_Atom<UniversalCopy<TA>, TA>{},     // scalar per-thread loads
                                    Layout<Shape<_32,_8>,Stride<_8,_1>>{},  // Thr layout 32x8 k-major
                                    Layout<Shape< _1,_1>>{});               // Val layout  1x1
  TiledCopy copyB = make_tiled_copy(Copy_Atom<UniversalCopy<TB>, TB>{},     // scalar per-thread loads
                                    Layout<Shape<_32,_8>,Stride<_8,_1>>{},  // Thr layout 32x8 k-major
                                    Layout<Shape< _1,_1>>{});               // Val layout  1x1

  TiledMMA mmaC = make_tiled_mma(GFX928_16x16x8_F32F32F32F32_NT{},
                                 Layout<Shape<_2,_2,_1>>{},  // Layout in Thr: 2x2 grid of atoms (assumes a 64-lane atom -> 256 threads, matching the 32x8 copy layout)
                                 Layout<Shape<_1,_1,_1>>{}   // Layout in Val
                                );
  // print_latex(tiled_mma);
  // print_latex(mma_op);

#if 0
  print(copyA);
  print(copyB);
  print(mmaC);
printf("size(copyA)=%d\n",size(copyA)); // printf("size(copyB)=%d\n",size(copyB)); // printf("size(mmaC)=%d\n",size(mmaC)); #endif #if 0 print_latex(copyA); print_latex(copyB); print_latex(mmaC); #endif dim3 dimBlock(size(mmaC)); dim3 dimGrid(size(ceil_div(M, bM)), size(ceil_div(N, bN))); gemm_device<<>> (prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); } template void gemm(char transA, char transB, int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B, int ldB, Beta beta, TC * C, int ldC, hipStream_t stream = 0) { if (transA == 'N' && transB == 'T') { return gemm_nt(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); } else if (transA == 'T' && transB == 'N') { return gemm_tn(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, stream); } assert(false && "Not implemented"); } int main(int argc, char** argv) { hipDeviceProp_t props; hipError_t error = hipGetDeviceProperties(&props, 0); if (error != hipSuccess) { std::cerr << "hipGetDeviceProperties() returned an error: " << hipGetErrorString(error) << std::endl; return -1; } int m = 2048; if (argc >= 2) sscanf(argv[1], "%d", &m); int n = 2048; if (argc >= 3) sscanf(argv[2], "%d", &n); int k = 1024; if (argc >= 4) sscanf(argv[3], "%d", &k); char transA = 'N'; if (argc >= 5) sscanf(argv[4], "%c", &transA); char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); using TA = float; using TB = float; using TC = float; using TI = float; TI alpha = TI(1.0); TI beta = TI(0.0); std::cout << "M = " << m << std::endl; std::cout << "N = " << n << std::endl; std::cout << "K = " << k << std::endl; std::cout << "C = A^" << transA << " B^" << transB << std::endl; thrust::host_vector h_A(m*k); thrust::host_vector h_B(n*k); thrust::host_vector h_C(m*n); // for (int j = 0; j < m*k; ++j) h_A[j] = static_cast( j ); // for (int j = 0; j < n*k; ++j) h_B[j] = static_cast( j ); for (int j = 0; j < m*k; ++j) h_A[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); for (int j = 0; j < n*k; ++j) h_B[j] = static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); thrust::device_vector d_A = h_A; thrust::device_vector d_B = h_B; thrust::device_vector d_C = h_C; double gflops = (2.0*m*n*k) * 1e-9; const int timing_iterations = 1; GPU_Clock timer; int ldA = 0, ldB = 0, ldC = m; if (transA == 'N') { ldA = m; } else if (transA == 'T') { ldA = k; } else { assert(false); } if (transB == 'N') { ldB = k; } else if (transB == 'T') { ldB = n; } else { assert(false); } #if defined(HYTLASS_ENABLE_HIPBLAS) && HYTLASS_ENABLE_HIPBLAS != 0 hipblasHandle_t handle; hipblasCreate(&handle); // Run once d_C = h_C; blam::hipblas::gemm(handle, transA == 'N' ? HIPBLAS_OP_N :HIPBLAS_OP_T, transB == 'N' ? HIPBLAS_OP_N :HIPBLAS_OP_T, m, n, k, &alpha, d_A.data().get(), ldA, d_B.data().get(), ldB, &beta, d_C.data().get(), m); HUTE_CHECK_LAST(); thrust::host_vector hipblas_result = d_C; // Timing iterations timer.start(); for (int i = 0; i < timing_iterations; ++i) { blam::hipblas::gemm(handle, transA == 'N' ? HIPBLAS_OP_N :HIPBLAS_OP_T, transB == 'N' ? 
                        m, n, k,
                        &alpha,
                        d_A.data().get(), ldA,
                        d_B.data().get(), ldB,
                        &beta,
                        d_C.data().get(), m);
  }
  double hipblas_time = timer.seconds() / timing_iterations;
  HUTE_CHECK_LAST();
  printf("BLAS_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / hipblas_time, hipblas_time*1000);
#else
  std::cout << "Verification by comparison with hipBLAS is disabled, "
               "either because the CMake option HYTLASS_ENABLE_HIPBLAS "
               "was explicitly set to OFF, or because CMake could not find hipBLAS. "
               "If you would like to enable verification with hipBLAS, "
               "please set the CMake option HYTLASS_ENABLE_HIPBLAS to ON, "
               "rerun CMake, and recompile this example.\n";
#endif // HYTLASS_ENABLE_HIPBLAS

  // Run once
  d_C = h_C;
  gemm(transA, transB, m, n, k,
       alpha,
       d_A.data().get(), ldA,
       d_B.data().get(), ldB,
       beta,
       d_C.data().get(), ldC);
  HUTE_CHECK_LAST();
  thrust::host_vector<TC> hute_result = d_C;

  if (transA == 'N' && transB == 'T') {
    using namespace hute;
    // NT: A is (m,k) column-major with ldA = m, B is (n,k) column-major with ldB = n
    TI * Dptr_host_cpu = (TI *)malloc(sizeof(TI) * m * n);
    TI * Dptr_host     = (TI *)malloc(sizeof(TI) * m * n);
    TI * Dptr_host_A   = &h_A[0];
    TI * Dptr_host_B   = &h_B[0];

    auto tD_host_cpu = make_tensor(Dptr_host_cpu, make_shape(m, n), make_stride(1, m));
    auto tA = make_tensor(Dptr_host_A, make_shape(m, k), make_stride(1, m));
    auto tB = make_tensor(Dptr_host_B, make_shape(n, k), make_stride(1, n));

    cpu_gemm(&tD_host_cpu, tA, tB);

    (void)hipMemcpy(Dptr_host, d_C.data().get(), sizeof(TI) * m * n, hipMemcpyDeviceToHost);
    auto tD_host_hute = make_tensor(Dptr_host, make_shape(m, n), make_stride(1, m));

    cpu_compare(tD_host_hute, tD_host_cpu, 0.1);
  } else if (transA == 'T' && transB == 'N') {
    using namespace hute;
    // TN: A is (k,m) column-major with ldA = k, B is (k,n) column-major with ldB = k
    TI * Dptr_host_cpu = (TI *)malloc(sizeof(TI) * m * n);
    TI * Dptr_host     = (TI *)malloc(sizeof(TI) * m * n);
    TI * Dptr_host_A   = &h_A[0];
    TI * Dptr_host_B   = &h_B[0];

    auto tD_host_cpu = make_tensor(Dptr_host_cpu, make_shape(m, n), make_stride(1, m));
    auto tA = make_tensor(Dptr_host_A, make_shape(m, k), make_stride(k, 1));
    auto tB = make_tensor(Dptr_host_B, make_shape(n, k), make_stride(k, 1));

    cpu_gemm(&tD_host_cpu, tA, tB);

    (void)hipMemcpy(Dptr_host, d_C.data().get(), sizeof(TI) * m * n, hipMemcpyDeviceToHost);
    auto tD_host_hute = make_tensor(Dptr_host, make_shape(m, n), make_stride(1, m));

    cpu_compare(tD_host_hute, tD_host_cpu, 0.1);
  }

  // Timing iterations
  timer.start();
  for (int i = 0; i < timing_iterations; ++i) {
    gemm(transA, transB, m, n, k,
         alpha,
         d_A.data().get(), ldA,
         d_B.data().get(), ldB,
         beta,
         d_C.data().get(), ldC);
  }
  double hute_time = timer.seconds() / timing_iterations;
  HUTE_CHECK_LAST();
  printf("HUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / hute_time, hute_time*1000);

#if defined(HYTLASS_ENABLE_HIPBLAS) && HYTLASS_ENABLE_HIPBLAS != 0
  printf("Empirical Perf: %.1f%%\n", (hipblas_time / hute_time) * 100);

  auto host_matrix_to_const_column_major_hute_tensor =
      [](const auto& X, int num_rows, int num_cols, int LDX) {
        const auto shape   = hute::Shape<int, int>{num_rows, num_cols};
        const auto strides = hute::Stride<int, int>{1, LDX};
        return hute::make_tensor(X.data(), hute::make_layout(shape, strides));
      };

  const auto A_view = host_matrix_to_const_column_major_hute_tensor(h_A, m, k, m);
  // B^T is k x n, so B is n x k.
  const auto B_view = host_matrix_to_const_column_major_hute_tensor(h_B, n, k, n);
  const auto C_computed_view = host_matrix_to_const_column_major_hute_tensor(hute_result, m, n, m);
  const auto C_expected_view = host_matrix_to_const_column_major_hute_tensor(hipblas_result, m, n, m);
  print_matrix_multiply_mollified_relative_error("float", A_view, B_view, C_computed_view, C_expected_view);
#endif // HYTLASS_ENABLE_HIPBLAS

  return 0;
}