/*************************************************************************************************** * Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ #pragma once #include #include namespace hute { /////////////////////////////////////v_mmac_f32_f32///////////////////////////////////// struct GFX928_16x16x8_F32F32F32F32_NT { using DRegisters = float[4]; using ARegisters = float[2]; using BRegisters = float[2]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float const& a0, float const& a1, float const& b0, float const& b1, float const& c0, float const& c1, float const& c2, float const& c3) { // printf("a:%f %f b:%f %f\n",a0,a1,b0,b1); #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; v2f a; v2f b; a.x = a0; a.y = a1; b.x = b0; b.y = b1; d = __builtin_hcu_mmac_f32_16x16x8f32(a, b, c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; /////////////////////////////////////v_mmac_f32_tf32///////////////////////////////////// struct GFX928_16x16x8_F32TF32TF32F32_NT { using DRegisters = float[4]; using ARegisters = float[2]; using BRegisters = float[2]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float const& a0, float const& a1, float const& b0, float const& b1, float const& c0, float const& c1, float const& c2, float const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; v2f a; v2f b; a.x = a0; a.y = a1; b.x = b0; b.y = b1; d = __builtin_hcu_mmac_f32_16x16x8tf32(a, b, c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; /////////////////////////////////////v_mmac_f32_f16///////////////////////////////////// struct GFX928_16x16x16_F32F16F16F32_NT { using DRegisters = float[4]; using ARegisters = half_t[4]; using BRegisters = half_t[4]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, float const& c0, float const& c1, float const& c2, float const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; __fp16x4_t A,B; A.x = a0; A.y = a1; A.z = a2; A.w = a3; B.x = b0; B.y = b1; B.z = b2; B.w = b3; d = __builtin_hcu_mmac_f32_16x16x16f16(A,B,c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x16x16_F32F16F16F32_NT_FOR_GEMM1 { using DRegisters = float[4]; using ARegisters = half_t[4]; using BRegisters = half_t[4]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, float const& c0, float const& c1, float const& c2, float const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; __fp16x4_t A,B; A.x = a0; A.y = a1; A.z = a2; A.w = a3; B.x = b0; B.y = b1; B.z = b2; B.w = b3; d = __builtin_hcu_mmac_f32_16x16x16f16(A,B,c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x16_F32F16F16F32_NT { using DRegisters = float[8]; using ARegisters = half_t[4]; using BRegisters = half_t[8]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, half_t const& b4, half_t const& b5,half_t const& b6, half_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; __fp16x4_t A, B, B0; A.x = a0; A.y = a1; A.z = a2; A.w = a3; B.x = b0; B.y = b1; B.z = b2; B.w = b3; B0.x = b4; B0.y = b5; B0.z = b6; B0.w = b7; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A,B0,C1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x16x16_F32F16F16F32_NT { using DRegisters = float[8]; using ARegisters = half_t[8]; using BRegisters = half_t[4]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& a4, half_t const& a5,half_t const& a6, half_t const& a7, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; __fp16x4_t A,A0,B; A.x = a0; A.y = a1; A.z = a2; A.w = a3; A0.x = a4; A0.y = a5; A0.z = a6; A0.w = a7; B.x = b0; B.y = b1; B.z = b2; B.w = b3; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B,C1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x32x16_F32F16F16F32_NT { using DRegisters = float[16]; using ARegisters = half_t[8]; using BRegisters = half_t[8]; using CRegisters = float[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, float & d8, float & d9, float & d10, float & d11, float & d12, float & d13, float & d14, float & d15, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& a4, half_t const& a5,half_t const& a6, half_t const& a7, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, half_t const& b4, half_t const& b5,half_t const& b6, half_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7, float const& c8, float const& c9, float const& c10, float const& c11, float const& c12, float const& c13, float const& c14, float const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1,C2,C3; v4f D0,D1,D2,D3; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; C2.x = c8; C2.y = c9; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; __fp16x4_t A,B,A0,B0; A.x = a0; A.y = a1; A.z = a2; A.w = a3; A0.x = a4; A0.y = a5; A0.z = a6; A0.w = a7; B.x = b0; B.y = b1; B.z = b2; B.w = b3; B0.x = b4; B0.y = b5; B0.z = b6; B0.w = b7; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A,B0,C1); D2 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B,C2); D3 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B0,C3); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; d8 = D2.x; d9 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; struct GFX928_32x32x16_F32F16F16F32_NT_ALT { using DRegisters = float[16]; using ARegisters = half_t[8]; using BRegisters = half_t[8]; using CRegisters = float[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, float & d8, float & d9, float & d10, float & d11, float & d12, float & d13, float & d14, float & d15, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& a4, half_t const& a5,half_t const& a6, half_t const& a7, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, half_t const& b4, half_t const& b5,half_t const& b6, half_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7, float const& c8, float const& c9, float const& c10, float const& c11, float const& c12, float const& c13, float const& c14, float const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1,C2,C3; v4f D0,D1,D2,D3; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; C2.x = c8; C2.y = c9; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; __fp16x4_t A,B,A0,B0; A.x = a0; A.y = a1; A.z = a2; A.w = a3; A0.x = a4; A0.y = a5; A0.z = a6; A0.w = a7; B.x = b0; B.y = b1; B.z = b2; B.w = b3; B0.x = b4; B0.y = b5; B0.z = b6; B0.w = b7; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A,B0,C1); D2 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B,C2); D3 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B0,C3); // asm volatile("v_mmac_f32_16x16x16_f16 %0, %1, %2, %3\n\t" // : "+v"(D0) // : "v"(A), "v"(B), "v"(C0)); // asm volatile("v_mmac_f32_16x16x16_f16 %0, %1, %2, %3\n\t" // : "+v"(D1) // : "v"(A), "v"(B0), "v"(C1)); // asm volatile("v_mmac_f32_16x16x16_f16 %0, %1, %2, %3\n\t" // : "+v"(D2) // : "v"(A0), "v"(B), "v"(C2)); // asm volatile("v_mmac_f32_16x16x16_f16 %0, %1, %2, %3\n\t" // : "+v"(D3) // : "v"(A0), "v"(B0), "v"(C3)); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; d8 = D2.x; d9 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; //////////////////////////// v_mmac_16x16x16 concatenate to depthx32 ///////////////////////////////////////////// struct GFX928_16x16x32_F32F16F16F32_NT { using DRegisters = float[4]; using ARegisters = half_t[8]; using BRegisters = half_t[8]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float &d0, float &d1, float &d2, float &d3, half_t const &a0, half_t const &a1, half_t const &a2, half_t const &a3, half_t const &a4, half_t const &a5, half_t const &a6, half_t const &a7, half_t const &b0, half_t const &b1, half_t const &b2, half_t const &b3, half_t const &b4, half_t const &b5, half_t const &b6, half_t const &b7, float const &c0, float const &c1, float const &c2, float const &c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; __fp16x4_t A,B; A.x = a0; A.y = a1; A.z = a2; A.w = a3; B.x = b0; B.y = b1; B.z = b2; B.w = b3; d = __builtin_hcu_mmac_f32_16x16x16f16(A,B,c); A.x = a4; A.y = a5; A.z = a6; A.w = a7; B.x = b4; B.y = b5; B.z = b6; B.w = b7; d = __builtin_hcu_mmac_f32_16x16x16f16(A,B,d); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x32_F32F16F16F32_NT { using DRegisters = float[8]; using ARegisters = half_t[8]; using BRegisters = half_t[16]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& a4, half_t const& a5,half_t const& a6, half_t const& a7, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, half_t const& b4, half_t const& b5,half_t const& b6, half_t const& b7, half_t const& b8, half_t const& b9,half_t const& b10, half_t const& b11, half_t const& b12, half_t const& b13,half_t const& b14, half_t const& b15, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; __fp16x4_t A,B,B0; A.x = a0; A.y = a1; A.z = a2; A.w = a3; B.x = b0; B.y = b1; B.z = b2; B.w = b3; B0.x = b8; B0.y = b9; B0.z = b10; B0.w = b11; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A,B0,C1); A.x = a4; A.y = a5; A.z = a6; A.w = a7; B.x = b4; B.y = b5; B.z = b6; B.w = b7; B0.x = b12; B0.y = b13; B0.z = b14; B0.w = b15; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,D0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A,B0,D1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x16x32_F32F16F16F32_NT { using DRegisters = float[8]; using ARegisters = half_t[16]; using BRegisters = half_t[8]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& a4, half_t const& a5,half_t const& a6, half_t const& a7, half_t const& a8, half_t const& a9,half_t const& a10, half_t const& a11, half_t const& a12, half_t const& a13,half_t const& a14, half_t const& a15, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, half_t const& b4, half_t const& b5,half_t const& b6, half_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; __fp16x4_t A, A0, B; A.x = a0; A.y = a1; A.z = a2; A.w = a3; A0.x = a8; A0.y = a9; A0.z = a10; A0.w = a11; B.x = b0; B.y = b1; B.z = b2; B.w = b3; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B,C1); A.x = a4; A.y = a5; A.z = a6; A.w = a7; A0.x = a12; A0.y = a13; A0.z = a14; A0.w = a15; B.x = b4; B.y = b5; B.z = b6; B.w = b7; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,D0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B,D1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x32x32_F32F16F16F32_NT { using DRegisters = float[16]; using ARegisters = half_t[16]; using BRegisters = half_t[16]; using CRegisters = float[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, float & d8, float & d9, float & d10, float & d11, float & d12, float & d13, float & d14, float & d15, half_t const& a0, half_t const& a1,half_t const& a2, half_t const& a3, half_t const& a4, half_t const& a5,half_t const& a6, half_t const& a7, half_t const& a8, half_t const& a9,half_t const& a10, half_t const& a11, half_t const& a12, half_t const& a13,half_t const& a14, half_t const& a15, half_t const& b0, half_t const& b1,half_t const& b2, half_t const& b3, half_t const& b4, half_t const& b5,half_t const& b6, half_t const& b7, half_t const& b8, half_t const& b9,half_t const& b10, half_t const& b11, half_t const& b12, half_t const& b13,half_t const& b14, half_t const& b15, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7, float const& c8, float const& c9, float const& c10, float const& c11, float const& c12, float const& c13, float const& c14, float const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1,C2,C3; v4f D0,D1,D2,D3; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; C2.x = c8; C2.y = c9; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; __fp16x4_t A,B,A0,B0; A.x = a0; A.y = a1; A.z = a2; A.w = a3; A0.x = a8; A0.y = a9; A0.z = a10; A0.w = a11; B.x = b0; B.y = b1; B.z = b2; B.w = b3; B0.x = b8; B0.y = b9; B0.z = b10; B0.w = b11; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A,B0,C1); D2 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B,C2); D3 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B0,C3); A.x = a4; A.y = a5; A.z = a6; A.w = a7; A0.x = a12; A0.y = a13; A0.z = a14; A0.w = a15; B.x = b4; B.y = b5; B.z = b6; B.w = b7; B0.x = b12; B0.y = b13; B0.z = b14; B0.w = b15; D0 = __builtin_hcu_mmac_f32_16x16x16f16(A,B,D0); D1 = __builtin_hcu_mmac_f32_16x16x16f16(A,B0,D1); D2 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B,D2); D3 = __builtin_hcu_mmac_f32_16x16x16f16(A0,B0,D3); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; d8 = D2.x; d9 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; /////////////////////////////////////v_mmac_f32_bf16///////////////////////////////////// struct GFX928_16x16x16_F32BF16BF16F32_NT { using DRegisters = float[4]; using ARegisters = bfloat16_t[4]; using BRegisters = bfloat16_t[4]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, float const& c0, float const& c1, float const& c2, float const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; hytlass::Array b; b[0] = b0; b[1] = b1; b[2] = b2; b[3] = b3; __bf16x4_t A,B; A = *(reinterpret_cast<__bf16x4_t *>(&a)); B = *(reinterpret_cast<__bf16x4_t *>(&b)); d = __builtin_hcu_mmac_f32_16x16x16bf16(A, B, c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x16x16_F32BF16BF16F32_NT_FOR_GEMM1 { using DRegisters = float[4]; using ARegisters = bfloat16_t[4]; using BRegisters = bfloat16_t[4]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, float const& c0, float const& c1, float const& c2, float const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; hytlass::Array b; b[0] = b0; b[1] = b1; b[2] = b2; b[3] = b3; __bf16x4_t A,B; A = *(reinterpret_cast<__bf16x4_t *>(&a)); B = *(reinterpret_cast<__bf16x4_t *>(&b)); d = __builtin_hcu_mmac_f32_16x16x16bf16(A, B, c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x16_F32BF16BF16F32_NT { using DRegisters = float[8]; using ARegisters = bfloat16_t[4]; using BRegisters = bfloat16_t[8]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, bfloat16_t const& b4, bfloat16_t const& b5,bfloat16_t const& b6, bfloat16_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; hytlass::Array array_a; hytlass::Array array_b0, array_b1; array_a[0] = a0; array_a[1] = a1; array_a[2] = a2; array_a[3] = a3; array_b0[0] = b0; array_b0[1] = b1; array_b0[2] = b2; array_b0[3] = b3; array_b1[0] = b4; array_b1[1] = b5; array_b1[2] = b6; array_b1[3] = b7; __bf16x4_t A,B,B0; A = *reinterpret_cast<__bf16x4_t*>(&array_a); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); B0 = *reinterpret_cast<__bf16x4_t*>(&array_b1); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B0,C1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x16x16_F32BF16BF16F32_NT { using DRegisters = float[8]; using ARegisters = bfloat16_t[8]; using BRegisters = bfloat16_t[4]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& a4, bfloat16_t const& a5,bfloat16_t const& a6, bfloat16_t const& a7, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; hytlass::Array array_a0, array_a1; hytlass::Array array_b; array_a0[0] = a0; array_a0[1] = a1; array_a0[2] = a2; array_a0[3] = a3; array_a1[0] = a4; array_a1[1] = a5; array_a1[2] = a6; array_a1[3] = a7; array_b[0] = b0; array_b[1] = b1; array_b[2] = b2; array_b[3] = b3; __bf16x4_t A, A0, B; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); A0 = *reinterpret_cast<__bf16x4_t*>(&array_a1); B = *reinterpret_cast<__bf16x4_t*>(&array_b); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B,C1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x32x16_F32BF16BF16F32_NT { using DRegisters = float[16]; using ARegisters = bfloat16_t[8]; using BRegisters = bfloat16_t[8]; using CRegisters = float[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, float & d8, float & d9, float & d10, float & d11, float & d12, float & d13, float & d14, float & d15, bfloat16_t const& a0, bfloat16_t const& a1, bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& a4, bfloat16_t const& a5, bfloat16_t const& a6, bfloat16_t const& a7, bfloat16_t const& b0, bfloat16_t const& b1, bfloat16_t const& b2, bfloat16_t const& b3, bfloat16_t const& b4, bfloat16_t const& b5, bfloat16_t const& b6, bfloat16_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7, float const& c8, float const& c9, float const& c10, float const& c11, float const& c12, float const& c13, float const& c14, float const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1,C2,C3; v4f D0,D1,D2,D3; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; C2.x = c8; C2.y = c9; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; hytlass::Array array_a0, array_a1; hytlass::Array array_b0, array_b1; array_a0[0] = a0; array_a0[1] = a1; array_a0[2] = a2; array_a0[3] = a3; array_a1[0] = a4; array_a1[1] = a5; array_a1[2] = a6; array_a1[3] = a7; array_b0[0] = b0; array_b0[1] = b1; array_b0[2] = b2; array_b0[3] = b3; array_b1[0] = b4; array_b1[1] = b5; array_b1[2] = b6; array_b1[3] = b7; __bf16x4_t A,B,A0,B0; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); A0 = *reinterpret_cast<__bf16x4_t*>(&array_a1); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); B0 = *reinterpret_cast<__bf16x4_t*>(&array_b1); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B0,C1); D2 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B,C2); D3 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B0,C3); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; d8 = D2.x; d9 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; struct GFX928_32x32x16_F32BF16BF16F32_NT_ALT { using DRegisters = float[16]; using ARegisters = bfloat16_t[8]; using BRegisters = bfloat16_t[8]; using CRegisters = float[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, float & d8, float & d9, float & d10, float & d11, float & d12, float & d13, float & d14, float & d15, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& a4, bfloat16_t const& a5,bfloat16_t const& a6, bfloat16_t const& a7, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, bfloat16_t const& b4, bfloat16_t const& b5,bfloat16_t const& b6, bfloat16_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7, float const& c8, float const& c9, float const& c10, float const& c11, float const& c12, float const& c13, float const& c14, float const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1,C2,C3; v4f D0,D1,D2,D3; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; C2.x = c8; C2.y = c9; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; hytlass::Array array_a0, array_a1; hytlass::Array array_b0, array_b1; array_a0[0] = a0; array_a0[1] = a1; array_a0[2] = a2; array_a0[3] = a3; array_a1[0] = a4; array_a1[1] = a5; array_a1[2] = a6; array_a1[3] = a7; array_b0[0] = b0; array_b0[1] = b1; array_b0[2] = b2; array_b0[3] = b3; array_b1[0] = b4; array_b1[1] = b5; array_b1[2] = b6; array_b1[3] = b7; __bf16x4_t A,B,A0,B0; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); A0 = *reinterpret_cast<__bf16x4_t*>(&array_a1); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); B0 = *reinterpret_cast<__bf16x4_t*>(&array_b1); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B0,C1); D2 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B,C2); D3 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B0,C3); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; d8 = D2.x; d9 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; struct GFX928_16x16x32_F32BF16BF16F32_NT { using DRegisters = float[4]; using ARegisters = bfloat16_t[8]; using BRegisters = bfloat16_t[8]; using CRegisters = float[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(float &d0, float &d1, float &d2, float &d3, bfloat16_t const &a0, bfloat16_t const &a1, bfloat16_t const &a2, bfloat16_t const &a3, bfloat16_t const &a4, bfloat16_t const &a5, bfloat16_t const &a6, bfloat16_t const &a7, bfloat16_t const &b0, bfloat16_t const &b1, bfloat16_t const &b2, bfloat16_t const &b3, bfloat16_t const &b4, bfloat16_t const &b5, bfloat16_t const &b6, bfloat16_t const &b7, float const &c0, float const &c1, float const &c2, float const &c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f c; v4f d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; hytlass::Array array_a0; hytlass::Array array_b0; array_a0[0] = a0; array_a0[1] = a1; array_a0[2] = a2; array_a0[3] = a3; array_b0[0] = b0; array_b0[1] = b1; array_b0[2] = b2; array_b0[3] = b3; __bf16x4_t A,B; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); d = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,c); array_a0[0] = a4; array_a0[1] = a5; array_a0[2] = a6; array_a0[3] = a7; array_b0[0] = b4; array_b0[1] = b5; array_b0[2] = b6; array_b0[3] = b7; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); d = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,d); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x32_F32BF16BF16F32_NT { using DRegisters = float[8]; using ARegisters = bfloat16_t[8]; using BRegisters = bfloat16_t[16]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& a4, bfloat16_t const& a5,bfloat16_t const& a6, bfloat16_t const& a7, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, bfloat16_t const& b4, bfloat16_t const& b5,bfloat16_t const& b6, bfloat16_t const& b7, bfloat16_t const& b8, bfloat16_t const& b9,bfloat16_t const& b10, bfloat16_t const& b11, bfloat16_t const& b12, bfloat16_t const& b13,bfloat16_t const& b14, bfloat16_t const& b15, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; hytlass::Array array_a0; hytlass::Array array_b0, array_b1; array_a0[0] = a0; array_a0[1] = a1; array_a0[2] = a2; array_a0[3] = a3; array_b0[0] = b0; array_b0[1] = b1; array_b0[2] = b2; array_b0[3] = b3; array_b1[0] = b8; array_b1[1] = b9; array_b1[2] = b10; array_b1[3] = b11; __bf16x4_t A,B,B0; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); B0 = *reinterpret_cast<__bf16x4_t*>(&array_b1); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B0,C1); array_a0[0] = a4; array_a0[1] = a5; array_a0[2] = a6; array_a0[3] = a7; array_b0[0] = b4; array_b0[1] = b5; array_b0[2] = b6; array_b0[3] = b7; array_b1[0] = b12; array_b1[1] = b13; array_b1[2] = b14; array_b1[3] = b15; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); B0 = *reinterpret_cast<__bf16x4_t*>(&array_b1); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,D0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B0,D1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x16x32_F32BF16BF16F32_NT { using DRegisters = float[8]; using ARegisters = bfloat16_t[16]; using BRegisters = bfloat16_t[8]; using CRegisters = float[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& a4, bfloat16_t const& a5,bfloat16_t const& a6, bfloat16_t const& a7, bfloat16_t const& a8, bfloat16_t const& a9,bfloat16_t const& a10, bfloat16_t const& a11, bfloat16_t const& a12, bfloat16_t const& a13,bfloat16_t const& a14, bfloat16_t const& a15, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, bfloat16_t const& b4, bfloat16_t const& b5,bfloat16_t const& b6, bfloat16_t const& b7, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1; v4f D0,D1; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; hytlass::Array array_a0, array_a1; hytlass::Array array_b0; array_a0[0] = a0; array_a0[1] = a1; array_a0[2] = a2; array_a0[3] = a3; array_a1[0] = a8; array_a1[1] = a9; array_a1[2] = a10; array_a1[3] = a11; array_b0[0] = b0; array_b0[1] = b1; array_b0[2] = b2; array_b0[3] = b3; __bf16x4_t A,B,A0; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); A0 = *reinterpret_cast<__bf16x4_t*>(&array_a1); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B,C1); array_a0[0] = a4; array_a0[1] = a5; array_a0[2] = a6; array_a0[3] = a7; array_a1[0] = a12; array_a1[1] = a13; array_a1[2] = a14; array_a1[3] = a15; array_b0[0] = b4; array_b0[1] = b5; array_b0[2] = b6; array_b0[3] = b7; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); A0 = *reinterpret_cast<__bf16x4_t*>(&array_a1); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,D0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B,D1); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; #endif } }; struct GFX928_32x32x32_F32BF16BF16F32_NT { using DRegisters = float[16]; using ARegisters = bfloat16_t[16]; using BRegisters = bfloat16_t[16]; using CRegisters = float[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(float & d0, float & d1, float & d2, float & d3, float & d4, float & d5, float & d6, float & d7, float & d8, float & d9, float & d10, float & d11, float & d12, float & d13, float & d14, float & d15, bfloat16_t const& a0, bfloat16_t const& a1,bfloat16_t const& a2, bfloat16_t const& a3, bfloat16_t const& a4, bfloat16_t const& a5,bfloat16_t const& a6, bfloat16_t const& a7, bfloat16_t const& a8, bfloat16_t const& a9,bfloat16_t const& a10, bfloat16_t const& a11, bfloat16_t const& a12, bfloat16_t const& a13,bfloat16_t const& a14, bfloat16_t const& a15, bfloat16_t const& b0, bfloat16_t const& b1,bfloat16_t const& b2, bfloat16_t const& b3, bfloat16_t const& b4, bfloat16_t const& b5,bfloat16_t const& b6, bfloat16_t const& b7, bfloat16_t const& b8, bfloat16_t const& b9,bfloat16_t const& b10, bfloat16_t const& b11, bfloat16_t const& b12, bfloat16_t const& b13,bfloat16_t const& b14, bfloat16_t const& b15, float const& c0, float const& c1, float const& c2, float const& c3, float const& c4, float const& c5, float const& c6, float const& c7, float const& c8, float const& c9, float const& c10, float const& c11, float const& c12, float const& c13, float const& c14, float const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) v4f C0,C1,C2,C3; v4f D0,D1,D2,D3; C0.x = c0; C0.y = c1; C0.z = c2; C0.w = c3; C1.x = c4; C1.y = c5; C1.z = c6; C1.w = c7; C2.x = c8; C2.y = c9; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; hytlass::Array array_a0, array_a1; hytlass::Array array_b0, array_b1; array_a0[0] = a0; array_a0[1] = a1; array_a0[2] = a2; array_a0[3] = a3; array_a1[0] = a8; array_a1[1] = a9; array_a1[2] = a10; array_a1[3] = a11; array_b0[0] = b0; array_b0[1] = b1; array_b0[2] = b2; array_b0[3] = b3; array_b1[0] = b8; array_b1[1] = b9; array_b1[2] = b10; array_b1[3] = b11; __bf16x4_t A,B,A0,B0; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); A0 = *reinterpret_cast<__bf16x4_t*>(&array_a1); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); B0 = *reinterpret_cast<__bf16x4_t*>(&array_b1); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,C0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B0,C1); D2 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B,C2); D3 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B0,C3); array_a0[0] = a4; array_a0[1] = a5; array_a0[2] = a6; array_a0[3] = a7; array_a1[0] = a12; array_a1[1] = a13; array_a1[2] = a14; array_a1[3] = a15; array_b0[0] = b4; array_b0[1] = b5; array_b0[2] = b6; array_b0[3] = b7; array_b1[0] = b12; array_b1[1] = b13; array_b1[2] = b14; array_b1[3] = b15; A = *reinterpret_cast<__bf16x4_t*>(&array_a0); A0 = *reinterpret_cast<__bf16x4_t*>(&array_a1); B = *reinterpret_cast<__bf16x4_t*>(&array_b0); B0 = *reinterpret_cast<__bf16x4_t*>(&array_b1); D0 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B,D0); D1 = __builtin_hcu_mmac_f32_16x16x16bf16(A,B0,D1); D2 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B,D2); D3 = __builtin_hcu_mmac_f32_16x16x16bf16(A0,B0,D3); d0 = D0.x; d1 = D0.y; d2 = D0.z; d3 = D0.w; d4 = D1.x; d5 = D1.y; d6 = D1.z; d7 = D1.w; d8 = D2.x; d9 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; /////////////////////////////////////v_mmac_i32_i8///////////////////////////////////// struct GFX928_16x16x32_I32I8I8I32_NT { using DRegisters = int[4]; using ARegisters = int8_t[8]; using BRegisters = int8_t[8]; using CRegisters = int[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, int8_t const& a0, int8_t const& a1,int8_t const& a2, int8_t const& a3, int8_t const& a4, int8_t const& a5,int8_t const& a6, int8_t const& a7, int8_t const& b0, int8_t const& b1,int8_t const& b2, int8_t const& b3, int8_t const& b4, int8_t const& b5,int8_t const& b6, int8_t const& b7, int const& c0, int const& c1, int const& c2, int const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t c; intx4_t d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); d = __builtin_hcu_mmac_i32_16x16x32i8(A,B,c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x32_I32I8I8I32_NT { using DRegisters = int[8]; using ARegisters = int8_t[8]; using BRegisters = int8_t[16]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, int & d4, int & d5, int & d6, int & d7, int8_t const& a0, int8_t const& a1,int8_t const& a2, int8_t const& a3, int8_t const& a4, int8_t const& a5,int8_t const& a6, int8_t const& a7, int8_t const& b0, int8_t const& b1,int8_t const& b2, int8_t const& b3, int8_t const& b4, int8_t const& b5,int8_t const& b6, int8_t const& b7, int8_t const& b8, int8_t const& b9,int8_t const& b10, int8_t const& b11, int8_t const& b12, int8_t const& b13,int8_t const& b14, int8_t const& b15, int const& c0, int const& c1, int const& c2, int const& c3, int const& c4, int const& c5, int const& c6, int const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C, C0; intx4_t D, D0; C.x = c0; C.y = c1; C.z = c2; C.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); D = __builtin_hcu_mmac_i32_16x16x32i8(A,B,C); C0.x = c4; C0.y = c5; C0.z = c6; C0.w = c7; b[0] = b8;b[1] = b9;b[2] = b10;b[3] = b11; b[4] = b12;b[5] = b13;b[6] = b14;b[7] = b15; B = *(reinterpret_cast(&b)); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A,B,C0); d0 = D.x; d1 = D.y; d2 = D.z; d3 = D.w; d4 = D0.x; d5 = D0.y; d6 = D0.z; d7 = D0.w; #endif } }; struct GFX928_32x16x32_I32I8I8I32_NT { using DRegisters = int[8]; using ARegisters = int8_t[16]; using BRegisters = int8_t[8]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, int & d4, int & d5, int & d6, int & d7, int8_t const& a0, int8_t const& a1,int8_t const& a2, int8_t const& a3, int8_t const& a4, int8_t const& a5,int8_t const& a6, int8_t const& a7, int8_t const& a8, int8_t const& a9, int8_t const& a10, int8_t const& a11, int8_t const& a12, int8_t const& a13, int8_t const& a14, int8_t const& a15, int8_t const& b0, int8_t const& b1,int8_t const& b2, int8_t const& b3, int8_t const& b4, int8_t const& b5,int8_t const& b6, int8_t const& b7, int const& c0, int const& c1, int const& c2, int const& c3, int const& c4, int const& c5, int const& c6, int const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C, C0; intx4_t D, D0; C.x = c0; C.y = c1; C.z = c2; C.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); D = __builtin_hcu_mmac_i32_16x16x32i8(A,B,C); C0.x = c4; C0.y = c5; C0.z = c6; C0.w = c7; a[0] = a8;a[1] = a9;a[2] = a10;a[3] = a11; a[4] = a12;a[5] = a13;a[6] = a14;a[7] = a15; A = *(reinterpret_cast(&a)); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A,B,C0); d0 = D.x; d1 = D.y; d2 = D.z; d3 = D.w; d4 = D0.x; d5 = D0.y; d6 = D0.z; d7 = D0.w; #endif } }; struct GFX928_32x32x32_I32I8I8I32_NT { using DRegisters = int[16]; using ARegisters = int8_t[16]; using BRegisters = int8_t[16]; using CRegisters = int[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, int & d08, int & d09, int & d10, int & d11, int & d12, int & d13, int & d14, int & d15, int8_t const& a00, int8_t const& a01, int8_t const& a02, int8_t const& a03, int8_t const& a04, int8_t const& a05, int8_t const& a06, int8_t const& a07, int8_t const& a08, int8_t const& a09, int8_t const& a10, int8_t const& a11, int8_t const& a12, int8_t const& a13, int8_t const& a14, int8_t const& a15, int8_t const& b00, int8_t const& b01, int8_t const& b02, int8_t const& b03, int8_t const& b04, int8_t const& b05, int8_t const& b06, int8_t const& b07, int8_t const& b08, int8_t const& b09, int8_t const& b10, int8_t const& b11, int8_t const& b12, int8_t const& b13, int8_t const& b14, int8_t const& b15, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07, int const& c08, int const& c09, int const& c10, int const& c11, int const& c12, int const& c13, int const& c14, int const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1, C2, C3; intx4_t D0, D1, D2, D3; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; C2.x = c08; C2.y = c09; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; a[8] = a08; a[9] = a09; a[10] = a10; a[11] = a11; a[12] = a12; a[13] = a13; a[14] = a14; a[15] = a15; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; b[8] = b08; b[9] = b09; b[10] = b10; b[11] = b11; b[12] = b12; b[13] = b13; b[14] = b14; b[15] = b15; long A0, A1, B0, B1; A0 = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); A1 = *(reinterpret_cast(&a) + 1); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B0, C0); D1 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B1, C1); D2 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B0, C2); D3 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B1, C3); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; d08 = D2.x; d09 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; struct GFX928_16x16x64_I32I8I8I32_NT { using DRegisters = int[4]; using ARegisters = int8_t[16]; using BRegisters = int8_t[16]; using CRegisters = int[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, int8_t const& a0, int8_t const& a1,int8_t const& a2, int8_t const& a3, int8_t const& a4, int8_t const& a5,int8_t const& a6, int8_t const& a7, int8_t const& a8, int8_t const& a9,int8_t const& a10, int8_t const& a11, int8_t const& a12, int8_t const& a13,int8_t const& a14, int8_t const& a15, int8_t const& b0, int8_t const& b1,int8_t const& b2, int8_t const& b3, int8_t const& b4, int8_t const& b5,int8_t const& b6, int8_t const& b7, int8_t const& b8, int8_t const& b9,int8_t const& b10, int8_t const& b11, int8_t const& b12, int8_t const& b13,int8_t const& b14, int8_t const& b15, int const& c0, int const& c1, int const& c2, int const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t c; intx4_t d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); d = __builtin_hcu_mmac_i32_16x16x32i8(A,B,c); a[0] = a8; a[1] = a9; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; b[0] = b8; b[1] = b9; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); d = __builtin_hcu_mmac_i32_16x16x32i8(A,B,d); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x64_I32I8I8I32_NT { using DRegisters = int[8]; using ARegisters = int8_t[16]; using BRegisters = int8_t[32]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, int8_t const& a00, int8_t const& a01, int8_t const& a02, int8_t const& a03, int8_t const& a04, int8_t const& a05, int8_t const& a06, int8_t const& a07, int8_t const& a08, int8_t const& a09, int8_t const& a10, int8_t const& a11, int8_t const& a12, int8_t const& a13, int8_t const& a14, int8_t const& a15, int8_t const& b00, int8_t const& b01, int8_t const& b02, int8_t const& b03, int8_t const& b04, int8_t const& b05, int8_t const& b06, int8_t const& b07, int8_t const& b08, int8_t const& b09, int8_t const& b10, int8_t const& b11, int8_t const& b12, int8_t const& b13, int8_t const& b14, int8_t const& b15, int8_t const& b16, int8_t const& b17, int8_t const& b18, int8_t const& b19, int8_t const& b20, int8_t const& b21, int8_t const& b22, int8_t const& b23, int8_t const& b24, int8_t const& b25, int8_t const& b26, int8_t const& b27, int8_t const& b28, int8_t const& b29, int8_t const& b30, int8_t const& b31, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1; intx4_t D0, D1; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; b[8] = b16; b[9] = b17; b[10] = b18; b[11] = b19; b[12] = b20; b[13] = b21; b[14] = b22; b[15] = b23; long A, B0, B1; A = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A, B0, C0); D1 = __builtin_hcu_mmac_i32_16x16x32i8(A, B1, C1); a[0] = a08; a[1] = a09; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; b[0] = b08; b[1] = b09; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; b[8] = b24; b[9] = b25; b[10] = b26; b[11] = b27; b[12] = b28; b[13] = b29; b[14] = b30; b[15] = b31; A = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A, B0, D0); D1 = __builtin_hcu_mmac_i32_16x16x32i8(A, B1, D1); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; #endif } }; struct GFX928_32x16x64_I32I8I8I32_NT { using DRegisters = int[8]; using ARegisters = int8_t[32]; using BRegisters = int8_t[16]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, int8_t const& a00, int8_t const& a01, int8_t const& a02, int8_t const& a03, int8_t const& a04, int8_t const& a05, int8_t const& a06, int8_t const& a07, int8_t const& a08, int8_t const& a09, int8_t const& a10, int8_t const& a11, int8_t const& a12, int8_t const& a13, int8_t const& a14, int8_t const& a15, int8_t const& a16, int8_t const& a17, int8_t const& a18, int8_t const& a19, int8_t const& a20, int8_t const& a21, int8_t const& a22, int8_t const& a23, int8_t const& a24, int8_t const& a25, int8_t const& a26, int8_t const& a27, int8_t const& a28, int8_t const& a29, int8_t const& a30, int8_t const& a31, int8_t const& b00, int8_t const& b01, int8_t const& b02, int8_t const& b03, int8_t const& b04, int8_t const& b05, int8_t const& b06, int8_t const& b07, int8_t const& b08, int8_t const& b09, int8_t const& b10, int8_t const& b11, int8_t const& b12, int8_t const& b13, int8_t const& b14, int8_t const& b15, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1; intx4_t D0, D1; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; a[8] = a16; a[9] = a17; a[10] = a18; a[11] = a19; a[12] = a20; a[13] = a21; a[14] = a22; a[15] = a23; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; long A0, A1, B; A0 = *(reinterpret_cast(&a)); A1 = *(reinterpret_cast(&a) + 1); B = *(reinterpret_cast(&b)); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B, C0); D1 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B, C1); a[0] = a08; a[1] = a09; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; a[8] = a24; a[9] = a25; a[10] = a26; a[11] = a27; a[12] = a28; a[13] = a29; a[14] = a30; a[15] = a31; b[0] = b08; b[1] = b09; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; A0 = *(reinterpret_cast(&a)); A1 = *(reinterpret_cast(&a) + 1); B = *(reinterpret_cast(&b)); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B, D0); D1 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B, D1); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; #endif } }; struct GFX928_32x32x64_I32I8I8I32_NT { using DRegisters = int[16]; using ARegisters = int8_t[32]; using BRegisters = int8_t[32]; using CRegisters = int[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, int & d08, int & d09, int & d10, int & d11, int & d12, int & d13, int & d14, int & d15, int8_t const& a00, int8_t const& a01, int8_t const& a02, int8_t const& a03, int8_t const& a04, int8_t const& a05, int8_t const& a06, int8_t const& a07, int8_t const& a08, int8_t const& a09, int8_t const& a10, int8_t const& a11, int8_t const& a12, int8_t const& a13, int8_t const& a14, int8_t const& a15, int8_t const& a16, int8_t const& a17, int8_t const& a18, int8_t const& a19, int8_t const& a20, int8_t const& a21, int8_t const& a22, int8_t const& a23, int8_t const& a24, int8_t const& a25, int8_t const& a26, int8_t const& a27, int8_t const& a28, int8_t const& a29, int8_t const& a30, int8_t const& a31, int8_t const& b00, int8_t const& b01, int8_t const& b02, int8_t const& b03, int8_t const& b04, int8_t const& b05, int8_t const& b06, int8_t const& b07, int8_t const& b08, int8_t const& b09, int8_t const& b10, int8_t const& b11, int8_t const& b12, int8_t const& b13, int8_t const& b14, int8_t const& b15, int8_t const& b16, int8_t const& b17, int8_t const& b18, int8_t const& b19, int8_t const& b20, int8_t const& b21, int8_t const& b22, int8_t const& b23, int8_t const& b24, int8_t const& b25, int8_t const& b26, int8_t const& b27, int8_t const& b28, int8_t const& b29, int8_t const& b30, int8_t const& b31, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07, int const& c08, int const& c09, int const& c10, int const& c11, int const& c12, int const& c13, int const& c14, int const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1, C2, C3; intx4_t D0, D1, D2, D3; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; C2.x = c08; C2.y = c09; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; a[8] = a16; a[9] = a17; a[10] = a18; a[11] = a19; a[12] = a20; a[13] = a21; a[14] = a22; a[15] = a23; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; b[8] = b16; b[9] = b17; b[10] = b18; b[11] = b19; b[12] = b20; b[13] = b21; b[14] = b22; b[15] = b23; long A0, A1, B0, B1; A0 = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); A1 = *(reinterpret_cast(&a) + 1); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B0, C0); D1 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B1, C1); D2 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B0, C2); D3 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B1, C3); a[0] = a08; a[1] = a09; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; a[8] = a24; a[9] = a25; a[10] = a26; a[11] = a27; a[12] = a28; a[13] = a29; a[14] = a30; a[15] = a31; b[0] = b08; b[1] = b09; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; b[8] = b24; b[9] = b25; b[10] = b26; b[11] = b27; b[12] = b28; b[13] = b29; b[14] = b30; b[15] = b31; A0 = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); A1 = *(reinterpret_cast(&a) + 1); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B0, D0); D1 = __builtin_hcu_mmac_i32_16x16x32i8(A0, B1, D1); D2 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B0, D2); D3 = __builtin_hcu_mmac_i32_16x16x32i8(A1, B1, D3); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; d08 = D2.x; d09 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; /////////////////////////////////////v_mmac_i32_u8///////////////////////////////////// struct GFX928_16x16x32_I32U8U8I32_NT { using DRegisters = int[4]; using ARegisters = uint8_t[8]; using BRegisters = uint8_t[8]; using CRegisters = int[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, uint8_t const& a0, uint8_t const& a1,uint8_t const& a2, uint8_t const& a3, uint8_t const& a4, uint8_t const& a5,uint8_t const& a6, uint8_t const& a7, uint8_t const& b0, uint8_t const& b1,uint8_t const& b2, uint8_t const& b3, uint8_t const& b4, uint8_t const& b5,uint8_t const& b6, uint8_t const& b7, int const& c0, int const& c1, int const& c2, int const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t c; intx4_t d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); d = __builtin_hcu_mmac_i32_16x16x32u8(A,B,c); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x32_I32U8U8I32_NT { using DRegisters = int[8]; using ARegisters = uint8_t[8]; using BRegisters = uint8_t[16]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, int & d4, int & d5, int & d6, int & d7, uint8_t const& a0, uint8_t const& a1,uint8_t const& a2, uint8_t const& a3, uint8_t const& a4, uint8_t const& a5,uint8_t const& a6, uint8_t const& a7, uint8_t const& b0, uint8_t const& b1,uint8_t const& b2, uint8_t const& b3, uint8_t const& b4, uint8_t const& b5,uint8_t const& b6, uint8_t const& b7, uint8_t const& b8, uint8_t const& b9,uint8_t const& b10, uint8_t const& b11, uint8_t const& b12, uint8_t const& b13,uint8_t const& b14, uint8_t const& b15, int const& c0, int const& c1, int const& c2, int const& c3, int const& c4, int const& c5, int const& c6, int const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C, C0; intx4_t D, D0; C.x = c0; C.y = c1; C.z = c2; C.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); D = __builtin_hcu_mmac_i32_16x16x32u8(A,B,C); C0.x = c4; C0.y = c5; C0.z = c6; C0.w = c7; b[0] = b8;b[1] = b9;b[2] = b10;b[3] = b11; b[4] = b12;b[5] = b13;b[6] = b14;b[7] = b15; B = *(reinterpret_cast(&b)); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A,B,C0); d0 = D.x; d1 = D.y; d2 = D.z; d3 = D.w; d4 = D0.x; d5 = D0.y; d6 = D0.z; d7 = D0.w; #endif } }; struct GFX928_32x16x32_I32U8U8I32_NT { using DRegisters = int[8]; using ARegisters = uint8_t[16]; using BRegisters = uint8_t[8]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, int & d4, int & d5, int & d6, int & d7, uint8_t const& a0, uint8_t const& a1,uint8_t const& a2, uint8_t const& a3, uint8_t const& a4, uint8_t const& a5,uint8_t const& a6, uint8_t const& a7, uint8_t const& a8, uint8_t const& a9, uint8_t const& a10, uint8_t const& a11, uint8_t const& a12, uint8_t const& a13, uint8_t const& a14, uint8_t const& a15, uint8_t const& b0, uint8_t const& b1,uint8_t const& b2, uint8_t const& b3, uint8_t const& b4, uint8_t const& b5,uint8_t const& b6, uint8_t const& b7, int const& c0, int const& c1, int const& c2, int const& c3, int const& c4, int const& c5, int const& c6, int const& c7) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C, C0; intx4_t D, D0; C.x = c0; C.y = c1; C.z = c2; C.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); D = __builtin_hcu_mmac_i32_16x16x32u8(A,B,C); C0.x = c4; C0.y = c5; C0.z = c6; C0.w = c7; a[0] = a8;a[1] = a9;a[2] = a10;a[3] = a11; a[4] = a12;a[5] = a13;a[6] = a14;a[7] = a15; A = *(reinterpret_cast(&a)); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A,B,C0); d0 = D.x; d1 = D.y; d2 = D.z; d3 = D.w; d4 = D0.x; d5 = D0.y; d6 = D0.z; d7 = D0.w; #endif } }; struct GFX928_32x32x32_I32U8U8I32_NT { using DRegisters = int[16]; using ARegisters = uint8_t[16]; using BRegisters = uint8_t[16]; using CRegisters = int[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, int & d08, int & d09, int & d10, int & d11, int & d12, int & d13, int & d14, int & d15, uint8_t const& a00, uint8_t const& a01, uint8_t const& a02, uint8_t const& a03, uint8_t const& a04, uint8_t const& a05, uint8_t const& a06, uint8_t const& a07, uint8_t const& a08, uint8_t const& a09, uint8_t const& a10, uint8_t const& a11, uint8_t const& a12, uint8_t const& a13, uint8_t const& a14, uint8_t const& a15, uint8_t const& b00, uint8_t const& b01, uint8_t const& b02, uint8_t const& b03, uint8_t const& b04, uint8_t const& b05, uint8_t const& b06, uint8_t const& b07, uint8_t const& b08, uint8_t const& b09, uint8_t const& b10, uint8_t const& b11, uint8_t const& b12, uint8_t const& b13, uint8_t const& b14, uint8_t const& b15, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07, int const& c08, int const& c09, int const& c10, int const& c11, int const& c12, int const& c13, int const& c14, int const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1, C2, C3; intx4_t D0, D1, D2, D3; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; C2.x = c08; C2.y = c09; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; a[8] = a08; a[9] = a09; a[10] = a10; a[11] = a11; a[12] = a12; a[13] = a13; a[14] = a14; a[15] = a15; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; b[8] = b08; b[9] = b09; b[10] = b10; b[11] = b11; b[12] = b12; b[13] = b13; b[14] = b14; b[15] = b15; long A0, A1, B0, B1; A0 = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); A1 = *(reinterpret_cast(&a) + 1); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B0, C0); D1 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B1, C1); D2 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B0, C2); D3 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B1, C3); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; d08 = D2.x; d09 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; struct GFX928_16x16x64_I32U8U8I32_NT { using DRegisters = int[4]; using ARegisters = uint8_t[16]; using BRegisters = uint8_t[16]; using CRegisters = int[4]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d0, int & d1, int & d2, int & d3, uint8_t const& a0, uint8_t const& a1,uint8_t const& a2, uint8_t const& a3, uint8_t const& a4, uint8_t const& a5,uint8_t const& a6, uint8_t const& a7, uint8_t const& a8, uint8_t const& a9,uint8_t const& a10, uint8_t const& a11, uint8_t const& a12, uint8_t const& a13,uint8_t const& a14, uint8_t const& a15, uint8_t const& b0, uint8_t const& b1,uint8_t const& b2, uint8_t const& b3, uint8_t const& b4, uint8_t const& b5,uint8_t const& b6, uint8_t const& b7, uint8_t const& b8, uint8_t const& b9,uint8_t const& b10, uint8_t const& b11, uint8_t const& b12, uint8_t const& b13,uint8_t const& b14, uint8_t const& b15, int const& c0, int const& c1, int const& c2, int const& c3) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t c; intx4_t d; c.x = c0; c.y = c1; c.z = c2; c.w = c3; hytlass::Array a; a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; a[4] = a4; a[5] = a5; a[6] = a6; a[7] = a7; hytlass::Array b; b[0] = b0;b[1] = b1;b[2] = b2;b[3] = b3; b[4] = b4;b[5] = b5;b[6] = b6;b[7] = b7; long A, B; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); d = __builtin_hcu_mmac_i32_16x16x32u8(A,B,c); a[0] = a8; a[1] = a9; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; b[0] = b8; b[1] = b9; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; A = *(reinterpret_cast(&a)); B = *(reinterpret_cast(&b)); d = __builtin_hcu_mmac_i32_16x16x32u8(A,B,d); d0 = d.x; d1 = d.y; d2 = d.z; d3 = d.w; #endif } }; struct GFX928_16x32x64_I32U8U8I32_NT { using DRegisters = int[8]; using ARegisters = uint8_t[16]; using BRegisters = uint8_t[32]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, uint8_t const& a00, uint8_t const& a01, uint8_t const& a02, uint8_t const& a03, uint8_t const& a04, uint8_t const& a05, uint8_t const& a06, uint8_t const& a07, uint8_t const& a08, uint8_t const& a09, uint8_t const& a10, uint8_t const& a11, uint8_t const& a12, uint8_t const& a13, uint8_t const& a14, uint8_t const& a15, uint8_t const& b00, uint8_t const& b01, uint8_t const& b02, uint8_t const& b03, uint8_t const& b04, uint8_t const& b05, uint8_t const& b06, uint8_t const& b07, uint8_t const& b08, uint8_t const& b09, uint8_t const& b10, uint8_t const& b11, uint8_t const& b12, uint8_t const& b13, uint8_t const& b14, uint8_t const& b15, uint8_t const& b16, uint8_t const& b17, uint8_t const& b18, uint8_t const& b19, uint8_t const& b20, uint8_t const& b21, uint8_t const& b22, uint8_t const& b23, uint8_t const& b24, uint8_t const& b25, uint8_t const& b26, uint8_t const& b27, uint8_t const& b28, uint8_t const& b29, uint8_t const& b30, uint8_t const& b31, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1; intx4_t D0, D1; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; b[8] = b16; b[9] = b17; b[10] = b18; b[11] = b19; b[12] = b20; b[13] = b21; b[14] = b22; b[15] = b23; long A, B0, B1; A = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A, B0, C0); D1 = __builtin_hcu_mmac_i32_16x16x32u8(A, B1, C1); a[0] = a08; a[1] = a09; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; b[0] = b08; b[1] = b09; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; b[8] = b24; b[9] = b25; b[10] = b26; b[11] = b27; b[12] = b28; b[13] = b29; b[14] = b30; b[15] = b31; A = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A, B0, D0); D1 = __builtin_hcu_mmac_i32_16x16x32u8(A, B1, D1); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; #endif } }; struct GFX928_32x16x64_I32U8U8I32_NT { using DRegisters = int[8]; using ARegisters = uint8_t[32]; using BRegisters = uint8_t[16]; using CRegisters = int[8]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, uint8_t const& a00, uint8_t const& a01, uint8_t const& a02, uint8_t const& a03, uint8_t const& a04, uint8_t const& a05, uint8_t const& a06, uint8_t const& a07, uint8_t const& a08, uint8_t const& a09, uint8_t const& a10, uint8_t const& a11, uint8_t const& a12, uint8_t const& a13, uint8_t const& a14, uint8_t const& a15, uint8_t const& a16, uint8_t const& a17, uint8_t const& a18, uint8_t const& a19, uint8_t const& a20, uint8_t const& a21, uint8_t const& a22, uint8_t const& a23, uint8_t const& a24, uint8_t const& a25, uint8_t const& a26, uint8_t const& a27, uint8_t const& a28, uint8_t const& a29, uint8_t const& a30, uint8_t const& a31, uint8_t const& b00, uint8_t const& b01, uint8_t const& b02, uint8_t const& b03, uint8_t const& b04, uint8_t const& b05, uint8_t const& b06, uint8_t const& b07, uint8_t const& b08, uint8_t const& b09, uint8_t const& b10, uint8_t const& b11, uint8_t const& b12, uint8_t const& b13, uint8_t const& b14, uint8_t const& b15, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1; intx4_t D0, D1; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; a[8] = a16; a[9] = a17; a[10] = a18; a[11] = a19; a[12] = a20; a[13] = a21; a[14] = a22; a[15] = a23; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; long A0, A1, B; A0 = *(reinterpret_cast(&a)); A1 = *(reinterpret_cast(&a) + 1); B = *(reinterpret_cast(&b)); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B, C0); D1 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B, C1); a[0] = a08; a[1] = a09; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; a[8] = a24; a[9] = a25; a[10] = a26; a[11] = a27; a[12] = a28; a[13] = a29; a[14] = a30; a[15] = a31; b[0] = b08; b[1] = b09; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; A0 = *(reinterpret_cast(&a)); A1 = *(reinterpret_cast(&a) + 1); B = *(reinterpret_cast(&b)); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B, D0); D1 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B, D1); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; #endif } }; struct GFX928_32x32x64_I32U8U8I32_NT { using DRegisters = int[16]; using ARegisters = uint8_t[32]; using BRegisters = uint8_t[32]; using CRegisters = int[16]; // Register asm fma HUTE_HOST_DEVICE static void fma(int & d00, int & d01, int & d02, int & d03, int & d04, int & d05, int & d06, int & d07, int & d08, int & d09, int & d10, int & d11, int & d12, int & d13, int & d14, int & d15, uint8_t const& a00, uint8_t const& a01, uint8_t const& a02, uint8_t const& a03, uint8_t const& a04, uint8_t const& a05, uint8_t const& a06, uint8_t const& a07, uint8_t const& a08, uint8_t const& a09, uint8_t const& a10, uint8_t const& a11, uint8_t const& a12, uint8_t const& a13, uint8_t const& a14, uint8_t const& a15, uint8_t const& a16, uint8_t const& a17, uint8_t const& a18, uint8_t const& a19, uint8_t const& a20, uint8_t const& a21, uint8_t const& a22, uint8_t const& a23, uint8_t const& a24, uint8_t const& a25, uint8_t const& a26, uint8_t const& a27, uint8_t const& a28, uint8_t const& a29, uint8_t const& a30, uint8_t const& a31, uint8_t const& b00, uint8_t const& b01, uint8_t const& b02, uint8_t const& b03, uint8_t const& b04, uint8_t const& b05, uint8_t const& b06, uint8_t const& b07, uint8_t const& b08, uint8_t const& b09, uint8_t const& b10, uint8_t const& b11, uint8_t const& b12, uint8_t const& b13, uint8_t const& b14, uint8_t const& b15, uint8_t const& b16, uint8_t const& b17, uint8_t const& b18, uint8_t const& b19, uint8_t const& b20, uint8_t const& b21, uint8_t const& b22, uint8_t const& b23, uint8_t const& b24, uint8_t const& b25, uint8_t const& b26, uint8_t const& b27, uint8_t const& b28, uint8_t const& b29, uint8_t const& b30, uint8_t const& b31, int const& c00, int const& c01, int const& c02, int const& c03, int const& c04, int const& c05, int const& c06, int const& c07, int const& c08, int const& c09, int const& c10, int const& c11, int const& c12, int const& c13, int const& c14, int const& c15) { #if (defined(__gfx928__) || defined(__gfx936__)) && defined(__HIPCC__) intx4_t C0, C1, C2, C3; intx4_t D0, D1, D2, D3; C0.x = c00; C0.y = c01; C0.z = c02; C0.w = c03; C1.x = c04; C1.y = c05; C1.z = c06; C1.w = c07; C2.x = c08; C2.y = c09; C2.z = c10; C2.w = c11; C3.x = c12; C3.y = c13; C3.z = c14; C3.w = c15; hytlass::Array a; a[0] = a00; a[1] = a01; a[2] = a02; a[3] = a03; a[4] = a04; a[5] = a05; a[6] = a06; a[7] = a07; a[8] = a16; a[9] = a17; a[10] = a18; a[11] = a19; a[12] = a20; a[13] = a21; a[14] = a22; a[15] = a23; hytlass::Array b; b[0] = b00; b[1] = b01; b[2] = b02; b[3] = b03; b[4] = b04; b[5] = b05; b[6] = b06; b[7] = b07; b[8] = b16; b[9] = b17; b[10] = b18; b[11] = b19; b[12] = b20; b[13] = b21; b[14] = b22; b[15] = b23; long A0, A1, B0, B1; A0 = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); A1 = *(reinterpret_cast(&a) + 1); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B0, C0); D1 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B1, C1); D2 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B0, C2); D3 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B1, C3); a[0] = a08; a[1] = a09; a[2] = a10; a[3] = a11; a[4] = a12; a[5] = a13; a[6] = a14; a[7] = a15; a[8] = a24; a[9] = a25; a[10] = a26; a[11] = a27; a[12] = a28; a[13] = a29; a[14] = a30; a[15] = a31; b[0] = b08; b[1] = b09; b[2] = b10; b[3] = b11; b[4] = b12; b[5] = b13; b[6] = b14; b[7] = b15; b[8] = b24; b[9] = b25; b[10] = b26; b[11] = b27; b[12] = b28; b[13] = b29; b[14] = b30; b[15] = b31; A0 = *(reinterpret_cast(&a)); B0 = *(reinterpret_cast(&b)); A1 = *(reinterpret_cast(&a) + 1); B1 = *(reinterpret_cast(&b) + 1); D0 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B0, D0); D1 = __builtin_hcu_mmac_i32_16x16x32u8(A0, B1, D1); D2 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B0, D2); D3 = __builtin_hcu_mmac_i32_16x16x32u8(A1, B1, D3); d00 = D0.x; d01 = D0.y; d02 = D0.z; d03 = D0.w; d04 = D1.x; d05 = D1.y; d06 = D1.z; d07 = D1.w; d08 = D2.x; d09 = D2.y; d10 = D2.z; d11 = D2.w; d12 = D3.x; d13 = D3.y; d14 = D3.z; d15 = D3.w; #endif } }; //////////////////////////////////////////////////////////////////////////////////////////////////// } // end namespace hute