.text .amdgcn_target "amdgcn-amd-amdhsa--gfx938:sramecc+" .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .globl _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params,@function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x110 s_load_dwordx2 s[34:35], s[4:5], 0x0 s_lshl_b32 s2, s8, 3 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[2:3], s[2:3], 2 s_waitcnt lgkmcnt(0) s_add_u32 s44, s0, s2 s_addc_u32 s45, s1, s3 s_load_dwordx4 s[12:15], s[44:45], 0x0 s_waitcnt 
lgkmcnt(0) s_cmp_ge_i32 s12, s34 s_cbranch_scc1 .LBB0_69 ; %bb.1: s_cmp_gt_i32 s12, s14 s_cbranch_scc1 .LBB0_69 ; %bb.2: ; %.lr.ph s_load_dwordx2 s[56:57], s[4:5], 0x68 s_load_dwordx4 s[16:19], s[4:5], 0x58 s_load_dwordx2 s[58:59], s[4:5], 0x80 s_load_dwordx4 s[8:11], s[4:5], 0x90 s_load_dwordx2 s[60:61], s[4:5], 0xb0 s_load_dwordx4 s[40:43], s[4:5], 0xc0 s_load_dwordx2 s[36:37], s[4:5], 0xe0 s_load_dwordx4 s[20:23], s[4:5], 0xf0 s_load_dwordx4 s[0:3], s[4:5], 0x140 s_load_dwordx2 s[62:63], s[4:5], 0x100 s_load_dwordx4 s[24:27], s[4:5], 0x120 s_load_dwordx4 s[28:31], s[4:5], 0xc s_load_dwordx2 s[38:39], s[4:5], 0x20 s_load_dword s33, s[44:45], 0x10 s_load_dwordx2 s[64:65], s[4:5], 0x130 s_waitcnt lgkmcnt(0) s_load_dword s0, s[0:1], 0x0 s_nop 0 s_load_dword s34, s[2:3], 0x0 s_ashr_i32 s1, s13, 31 s_lshr_b32 s1, s1, 26 s_lshl_b32 s6, s6, 6 s_add_i32 s1, s13, s1 s_ashr_i32 s2, s6, 31 s_ashr_i32 s44, s7, 31 s_ashr_i32 s13, s1, 6 s_mul_i32 s1, s10, s2 s_mul_hi_u32 s3, s10, s6 s_mul_hi_u32 s45, s42, s7 s_mul_i32 s46, s42, s44 s_add_i32 s1, s3, s1 s_mul_i32 s3, s11, s6 s_add_i32 s45, s45, s46 s_mul_i32 s43, s43, s7 s_add_i32 s1, s1, s3 s_mul_i32 s3, s10, s6 s_add_i32 s45, s45, s43 s_mul_i32 s42, s42, s7 s_add_u32 s68, s42, s3 s_addc_u32 s69, s45, s1 s_ashr_i32 s1, s30, 31 s_add_i32 s3, s30, s1 s_xor_b32 s3, s3, s1 v_cvt_f32_u32_e32 v1, s3 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v2, s0 v_mul_f32_e32 v2, s34, v2 v_mul_f32_e32 v113, s38, v2 v_rcp_iflag_f32_e32 v1, v1 s_mov_b32 s30, s8 s_sub_i32 s8, 0, s3 s_xor_b32 s0, s44, s1 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 s_add_i32 s1, s7, s44 s_xor_b32 s1, s1, s44 v_mul_f32_e32 v114, s39, v2 v_readfirstlane_b32 s38, v1 s_mul_i32 s8, s8, s38 s_mul_hi_u32 s8, s38, s8 s_add_i32 s38, s38, s8 s_mul_hi_u32 s8, s1, s38 s_mul_i32 s38, s8, s3 s_sub_i32 s1, s1, s38 s_add_i32 s38, s8, 1 s_sub_i32 s39, s1, s3 s_cmp_ge_u32 s1, s3 s_cselect_b32 s8, s38, s8 ; implicit-def: $vgpr231 : SGPR spill to VGPR lane s_cselect_b32 s1, s39, s1 
v_writelane_b32 v231, s9, 0 s_add_i32 s9, s8, 1 s_cmp_ge_u32 s1, s3 s_cselect_b32 s1, s9, s8 s_xor_b32 s1, s1, s0 s_sub_i32 s3, s1, s0 v_and_b32_e32 v7, 63, v0 v_and_b32_e32 v9, 60, v0 s_ashr_i32 s0, s3, 31 v_sub_u16_e32 v7, v7, v9 v_lshrrev_b16_e32 v9, 3, v0 s_mul_i32 s0, s36, s0 s_mul_hi_u32 s1, s36, s3 v_and_b32_e32 v9, 3, v9 s_add_i32 s8, s1, s0 s_load_dwordx2 s[0:1], s[4:5], 0x48 v_add_u16_e32 v7, v9, v7 v_mov_b32_e32 v9, 13 v_lshrrev_b16_sdwa v9, v9, sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 v_and_b32_e32 v9, 3, v9 s_mul_i32 s9, s37, s3 v_and_b32_e32 v3, 15, v0 v_lshrrev_b32_e32 v4, 2, v0 v_lshrrev_b16_e32 v8, 2, v0 v_add_u16_e32 v9, v7, v9 s_add_i32 s8, s8, s9 s_mul_i32 s3, s36, s3 s_load_dwordx4 s[36:39], s[4:5], 0x28 v_lshrrev_b32_e32 v2, 8, v0 v_and_or_b32 v3, v4, 48, v3 v_and_b32_e32 v8, 15, v8 v_and_b32_e32 v9, 0xfc, v9 s_waitcnt lgkmcnt(0) s_add_u32 s71, s0, s3 s_movk_i32 s0, 0x200 v_lshlrev_b32_e32 v5, 1, v3 v_and_b32_e32 v8, 0xffff, v8 v_sub_u16_e32 v7, v7, v9 v_lshlrev_b32_e32 v9, 12, v2 s_addc_u32 s72, s1, s8 v_cmp_gt_u32_e32 vcc, s0, v0 v_add_lshl_u32 v5, v5, v2, 2 s_add_i32 s0, 0, 0xb400 v_lshlrev_b32_e32 v6, 3, v3 v_bfe_i32 v7, v7, 0, 8 v_lshl_or_b32 v8, v8, 6, v9 v_add_u32_e32 v115, s0, v5 v_add_u32_e32 v116, s0, v6 v_lshl_add_u32 v7, v7, 4, v8 s_add_i32 s0, 0, 0x400 v_add_u32_e32 v117, s0, v7 s_add_i32 s0, 0, 0x800 v_add_u32_e32 v118, s0, v7 s_add_i32 s0, 0, 0xc00 v_add_u32_e32 v1, -2, v2 v_add_u32_e32 v119, s0, v7 s_add_i32 s0, 0, 0xb000 v_cndmask_b32_e32 v1, v1, v2, vcc v_add_u32_e32 v120, s0, v5 v_bfe_u32 v5, v0, 4, 2 v_add_u32_e32 v121, s0, v6 v_lshl_or_b32 v5, v1, 6, v5 s_sub_i32 s0, s35, s6 s_mul_i32 s2, s40, s2 s_mul_hi_u32 s3, s40, s6 s_mul_hi_u32 s4, s20, s7 s_mul_i32 s5, s20, s44 v_cmp_eq_u32_e32 vcc, 0, v5 v_cmp_gt_i32_e64 s[0:1], s0, v3 s_add_i32 s2, s3, s2 s_mul_i32 s3, s41, s6 s_add_i32 s4, s4, s5 s_mul_i32 s5, s21, s7 s_sub_i32 s73, s6, s35 s_and_b64 s[66:67], vcc, s[0:1] s_add_i32 s2, s2, s3 
s_mul_i32 s3, s40, s6 s_add_i32 s4, s4, s5 s_mul_i32 s5, s20, s7 s_add_u32 s3, s5, s3 v_and_b32_e32 v122, 48, v0 s_addc_u32 s76, s4, s2 s_movk_i32 s2, 0xc0 v_and_or_b32 v107, v4, s2, v122 v_and_b32_e32 v4, 12, v4 v_mov_b32_e32 v108, 0 v_writelane_b32 v231, s3, 1 v_lshrrev_b32_e32 v123, 6, v0 v_lshl_or_b32 v124, v1, 4, v4 v_and_b32_e32 v5, 0x330, v0 v_lshlrev_b32_e32 v8, 4, v0 v_mad_u64_u32 v[0:1], s[2:3], s40, v3, v[107:108] v_and_b32_e32 v126, 0xff0, v8 v_lshl_or_b32 v127, v2, 2, v126 v_mad_u64_u32 v[1:2], s[2:3], s41, v3, v[1:2] v_lshlrev_b32_e32 v6, 2, v3 v_mov_b32_e32 v2, s27 v_add_co_u32_e32 v129, vcc, s26, v6 v_addc_co_u32_e32 v130, vcc, 0, v2, vcc v_or_b32_e32 v4, s6, v3 v_add_u32_e32 v128, 0, v7 v_mov_b32_e32 v2, s19 v_add_co_u32_e32 v131, vcc, s18, v6 v_mbcnt_lo_u32_b32 v155, -1, 0 v_lshlrev_b64 v[109:110], 1, v[0:1] s_mov_b32 s75, s10 s_mov_b32 s77, s11 v_sub_u32_e32 v125, v116, v6 v_addc_co_u32_e32 v132, vcc, 0, v2, vcc s_mov_b32 s42, s60 v_xad_u32 v133, v4, -1, s35 v_or_b32_e32 v134, 1, v124 v_or_b32_e32 v135, 2, v124 v_or_b32_e32 v136, 3, v124 v_add_u32_e32 v137, 32, v124 v_add_u32_e32 v138, 33, v124 v_add_u32_e32 v139, 34, v124 v_add_u32_e32 v140, 35, v124 v_add_u32_e32 v141, 0x2000, v128 v_add_u32_e32 v142, 0x2000, v117 v_add_u32_e32 v143, 0x2000, v118 v_add_u32_e32 v144, 0x2000, v119 v_add_u32_e32 v145, 0x4000, v128 v_add_u32_e32 v146, 0x4000, v117 v_add_u32_e32 v147, 0x4000, v118 v_add_u32_e32 v148, 0x4000, v119 v_add_u32_e32 v149, 0x6000, v128 v_sub_u32_e32 v150, v121, v6 v_lshlrev_b32_e32 v151, 9, v3 v_add_u32_e32 v152, 0x6000, v117 v_add_u32_e32 v153, 0x6000, v118 v_add_u32_e32 v154, 0x6000, v119 s_brev_b32 s46, 1 s_mov_b32 s47, 0x20000 s_mov_b32 s43, 0x10000 v_mbcnt_hi_u32_b32 v156, -1, v155 s_mov_b32 s78, 0xff800000 v_cmp_eq_u32_e64 s[2:3], 0, v122 v_cmp_eq_u32_e64 s[4:5], 0, v5 s_mov_b32 s18, s12 s_mov_b32 s48, 0 s_branch .LBB0_5 .LBB0_3: ; %Flow295 ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[8:9] .LBB0_4: ; 
%_ZN5flash49compute_attn_1rowblock_splitkv_mla_fp8_gfx938_TP1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEvRK20Flash_fwd_mla_paramsiiiiiiibRT1_fff.exit ; in Loop: Header=BB0_5 Depth=1 s_add_i32 s8, s18, 1 s_cmp_lt_i32 s18, s14 s_mov_b32 s18, s8 s_cbranch_scc0 .LBB0_69 .LBB0_5: ; =>This Loop Header: Depth=1 ; Child Loop BB0_42 Depth 2 s_ashr_i32 s19, s18, 31 s_lshl_b64 s[20:21], s[18:19], 2 s_add_u32 s8, s36, s20 s_addc_u32 s9, s37, s21 global_load_dword v0, v108, s[8:9] s_cmp_le_i32 s18, s12 s_waitcnt vmcnt(0) v_readfirstlane_b32 s80, v0 s_cbranch_scc1 .LBB0_7 ; %bb.6: ; in Loop: Header=BB0_5 Depth=1 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) .LBB0_7: ; in Loop: Header=BB0_5 Depth=1 s_mul_i32 s8, s18, s57 s_mul_hi_u32 s9, s18, s56 s_add_i32 s8, s9, s8 s_mul_i32 s9, s19, s56 s_add_i32 s8, s8, s9 s_mul_i32 s9, s18, s56 s_add_u32 s9, s68, s9 s_addc_u32 s8, s69, s8 v_readfirstlane_b32 s86, v123 s_add_u32 s44, s38, s9 s_addc_u32 s45, s39, s8 s_ashr_i32 s8, s86, 31 s_lshr_b32 s8, s8, 30 s_add_i32 s8, s86, s8 s_ashr_i32 s89, s8, 2 s_and_b32 s8, s8, -4 s_sub_i32 s8, s86, s8 s_lshl_b32 s79, s89, 6 s_lshl_b32 s84, s8, 4 s_lshl_b32 s9, s89, 12 s_ashr_i32 s83, s79, 31 s_ashr_i32 s85, s84, 31 s_add_i32 s95, s9, 0 s_cmp_lg_u32 s95, -1 s_cselect_b32 s9, s95, 0 s_lshl_b32 s40, s8, 10 s_add_u32 s82, s9, s40 s_bitset1_b32 s82, 31 s_add_i32 s8, s84, s6 s_cmp_ge_i32 s8, s35 s_mov_b64 s[8:9], -1 s_cbranch_scc0 .LBB0_9 ; %bb.8: ; in Loop: Header=BB0_5 Depth=1 v_readfirstlane_b32 s8, v123 s_ashr_i32 s9, s8, 31 s_lshr_b32 s9, s9, 30 s_add_i32 s9, s8, s9 s_and_b32 s11, s9, 0x3ffffc s_sub_i32 s8, s8, s11 s_lshl_b32 s9, s9, 10 s_lshl_b32 s8, s8, 10 s_and_b32 s9, s9, 0xfffff000 s_add_i32 s8, s8, s9 s_add_i32 s8, s8, 0 v_mov_b32_e32 v0, -1 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s9, s8, 0x2000 ;;#ASMSTART 
s_mov_b32 m0, s9 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s9, s8, 0x4000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s9, s8, 0x6000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s8, s8, 0x8000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[8:9], 0 .LBB0_9: ; %Flow314 ; in Loop: Header=BB0_5 Depth=1 s_andn2_b64 vcc, exec, s[8:9] s_add_i32 s52, s84, 16 s_cbranch_vccnz .LBB0_14 ; %bb.10: ; in Loop: Header=BB0_5 Depth=1 s_mul_i32 s8, s84, s77 s_mul_hi_u32 s9, s84, s75 s_add_i32 s8, s9, s8 s_mul_i32 s9, s85, s75 s_add_i32 s8, s8, s9 s_mul_i32 s9, s84, s75 s_add_u32 s9, s9, s79 s_addc_u32 s11, s8, s83 s_add_i32 s8, s52, s73 s_max_i32 s26, s8, 0 s_add_u32 s8, s44, s9 s_addc_u32 s9, s45, s11 s_lshl_b32 s11, s26, 8 s_bitset1_b32 s11, 16 s_nop 0 matrix_load_64x16_b8 s[8:11] s82 t r lds s_add_i32 s26, s82, 0x2000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:128 t r lds s_add_i32 s26, s82, 0x4000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:256 t r lds s_add_i32 s26, s82, 0x6000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:384 t r lds s_cmp_gt_i32 s86, 3 s_mov_b64 s[26:27], -1 s_cbranch_scc0 .LBB0_12 ; %bb.11: ; in Loop: Header=BB0_5 Depth=1 v_readfirstlane_b32 s26, v123 s_ashr_i32 s27, s26, 31 s_lshr_b32 s27, s27, 30 s_add_i32 s27, s26, s27 s_and_b32 s41, s27, 0x3ffffc s_lshl_b32 s27, s27, 10 s_sub_i32 s26, s26, s41 s_and_b32 s27, s27, 0xfffff000 s_lshl_b32 s26, s26, 10 s_add_i32 s27, s27, 0 s_add_i32 s26, s27, s26 s_add_i32 s26, s26, 0x8000 v_mov_b32_e32 v0, -1 ;;#ASMSTART s_mov_b32 m0, s26 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[26:27], 0 .LBB0_12: ; %Flow311 ; in Loop: Header=BB0_5 Depth=1 s_andn2_b64 vcc, exec, s[26:27] s_cbranch_vccnz .LBB0_14 ; %bb.13: ; in Loop: Header=BB0_5 Depth=1 s_add_i32 
s26, s82, 0x8000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:512 t r lds .LBB0_14: ; %.preheader694.i ; in Loop: Header=BB0_5 Depth=1 s_cmp_eq_u32 s18, s12 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[26:27], s[8:9], exec s_cselect_b32 s81, s13, 0 s_cmp_eq_u32 s18, s14 s_cselect_b32 s11, s15, s80 s_add_i32 s11, s11, 63 s_ashr_i32 s26, s11, 31 s_lshr_b32 s26, s26, 26 s_add_i32 s11, s11, s26 s_ashr_i32 s11, s11, 6 s_add_i32 s26, s40, 0 ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND s_mov_b32 m0, s26 s_nop 0 ds_read_matrix_trans_format v[0:3], m0 element:1 row:3 col:1 ds_read_matrix_trans_format v[4:7], m0 offset:4096 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[8:11], m0 offset:8192 element:1 row:3 col:1 ds_read_matrix_trans_format v[12:15], m0 offset:12288 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[16:19], m0 offset:16384 element:1 row:3 col:1 ds_read_matrix_trans_format v[20:23], m0 offset:20480 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[24:27], m0 offset:24576 element:1 row:3 col:1 ds_read_matrix_trans_format v[28:31], m0 offset:28672 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND s_mul_i32 s26, s18, s63 s_mul_hi_u32 s27, s18, s62 s_add_i32 s26, s27, s26 s_mul_i32 s27, s19, s62 ds_read_matrix_trans_format v[32:35], m0 offset:32768 element:1 row:3 col:1 s_add_i32 s27, s26, s27 s_mul_i32 s26, s18, s62 s_lshl_b64 s[26:27], s[26:27], 2 s_add_u32 s88, s22, s26 s_addc_u32 s87, s23, s27 s_add_i32 s40, s11, -1 s_cmp_le_i32 s11, s81 v_mov_b32_e32 v157, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB0_40 ; %bb.15: ; %.lr.ph.i ; in Loop: Header=BB0_5 Depth=1 s_cmp_gt_i32 s86, 3 s_cselect_b64 s[26:27], -1, 0 s_ashr_i32 s41, s40, 31 s_lshl_b64 s[44:45], s[40:41], 2 s_add_u32 s44, s88, s44 s_addc_u32 s45, s87, s45 ;;#ASMSTART s_load_dword 
s41, s[44:45], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s44, s41, 31 s_mul_i32 s45, s41, s59 s_mul_hi_u32 s49, s41, s58 s_add_i32 s45, s49, s45 s_mul_i32 s44, s44, s58 s_add_i32 s45, s45, s44 s_mul_i32 s41, s41, s58 s_add_u32 s44, s71, s41 s_mul_i32 s41, s84, s61 s_mul_hi_u32 s49, s84, s60 s_addc_u32 s45, s72, s45 s_add_i32 s41, s49, s41 s_mul_i32 s49, s85, s60 s_add_i32 s49, s41, s49 s_mul_i32 s41, s84, s60 s_add_u32 s41, s41, s79 s_addc_u32 s91, s49, s83 s_lshl_b32 s74, s40, 6 s_add_i32 s49, s84, s74 s_cmp_lt_i32 s49, s80 s_mov_b64 s[50:51], -1 s_cbranch_scc1 .LBB0_17 ; %bb.16: ; in Loop: Header=BB0_5 Depth=1 v_readfirstlane_b32 s49, v123 s_ashr_i32 s50, s49, 31 s_lshr_b32 s50, s50, 30 s_add_i32 s50, s49, s50 s_and_b32 s51, s50, 0x3ffffc s_sub_i32 s49, s49, s51 s_lshl_b32 s50, s50, 10 s_lshl_b32 s49, s49, 10 s_and_b32 s50, s50, 0xfffff000 s_add_i32 s49, s49, s50 s_add_i32 s49, s49, 0 v_mov_b32_e32 v36, -1 ;;#ASMSTART s_mov_b32 m0, s49 buffer_load_dwordx4 v36, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s50, s49, 0x2000 ;;#ASMSTART s_mov_b32 m0, s50 buffer_load_dwordx4 v36, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s50, s49, 0x4000 ;;#ASMSTART s_mov_b32 m0, s50 buffer_load_dwordx4 v36, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s50, s49, 0x6000 ;;#ASMSTART s_mov_b32 m0, s50 buffer_load_dwordx4 v36, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s49, s49, 0x8000 ;;#ASMSTART s_mov_b32 m0, s49 buffer_load_dwordx4 v36, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[50:51], 0 .LBB0_17: ; %Flow308 ; in Loop: Header=BB0_5 Depth=1 s_sub_i32 s70, s52, s80 s_add_i32 s92, s82, 0x2000 s_add_i32 s93, s82, 0x4000 s_add_i32 s94, s82, 0x6000 s_andn2_b64 vcc, exec, s[50:51] s_add_i32 s90, s82, 0x8000 s_cbranch_vccnz .LBB0_22 ; %bb.18: ; in Loop: Header=BB0_5 Depth=1 s_add_i32 s49, s70, s74 s_max_i32 s49, s49, 0 s_add_u32 s52, s44, s41 s_addc_u32 s53, s45, s91 s_lshl_b32 s49, s49, 8 s_or_b32 s55, s49, 0x10000 s_mov_b32 s54, 
s42 s_nop 0 matrix_load_64x16_b8 s[52:55] s82 t r lds matrix_load_64x16_b8 s[52:55] s92 moffset:128 t r lds matrix_load_64x16_b8 s[52:55] s93 moffset:256 t r lds matrix_load_64x16_b8 s[52:55] s94 moffset:384 t r lds s_andn2_b64 vcc, exec, s[26:27] s_mov_b64 s[50:51], -1 s_cbranch_vccnz .LBB0_20 ; %bb.19: ; in Loop: Header=BB0_5 Depth=1 v_readfirstlane_b32 s49, v123 s_ashr_i32 s50, s49, 31 s_lshr_b32 s50, s50, 30 s_add_i32 s50, s49, s50 s_and_b32 s51, s50, 0x3ffffc s_lshl_b32 s50, s50, 10 s_sub_i32 s49, s49, s51 s_and_b32 s50, s50, 0xfffff000 s_lshl_b32 s49, s49, 10 s_add_i32 s50, s50, 0 s_add_i32 s49, s50, s49 s_add_i32 s49, s49, 0x8000 v_mov_b32_e32 v36, -1 ;;#ASMSTART s_mov_b32 m0, s49 buffer_load_dwordx4 v36, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[50:51], 0 .LBB0_20: ; %Flow305 ; in Loop: Header=BB0_5 Depth=1 s_andn2_b64 vcc, exec, s[50:51] s_cbranch_vccnz .LBB0_22 ; %bb.21: ; in Loop: Header=BB0_5 Depth=1 matrix_load_64x16_b8 s[52:55] s90 moffset:512 t r lds .LBB0_22: ; in Loop: Header=BB0_5 Depth=1 s_mul_i32 s44, s89, 0xfffff400 s_add_i32 s95, s95, s44 ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND s_mov_b32 m0, s95 s_nop 0 ds_read_matrix_trans_format v[36:39], m0 element:1 row:3 col:1 s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[44:45], s[48:49] v_mov_b64_e32 v[46:47], s[50:51] v_mov_b64_e32 v[50:51], v[46:47] v_mov_b64_e32 v[48:49], v[44:45] ds_read_matrix_trans_format v[40:43], m0 offset:2048 element:1 row:3 col:1 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[0:1], v[36:37], v[48:51] lit ds_read_matrix_trans_format v[52:55], m0 offset:4096 element:1 row:3 col:1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[2:3], v[38:39], v[48:51] lit ds_read_matrix_trans_format v[36:39], m0 offset:6144 element:1 row:3 col:1 s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[0:1], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[2:3], v[42:43], v[44:47] lit s_waitcnt lgkmcnt(1) 
v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[4:5], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[6:7], v[54:55], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[4:5], v[36:37], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[6:7], v[38:39], v[44:47] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[36:39], m0 offset:8192 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:10240 element:1 row:3 col:1 ds_read_matrix_trans_format v[52:55], m0 offset:12288 element:1 row:3 col:1 ds_read_matrix_trans_format v[56:59], m0 offset:14336 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[8:9], v[36:37], v[48:51] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[8:9], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[10:11], v[42:43], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[10:11], v[38:39], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[12:13], v[56:57], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[12:13], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[14:15], v[58:59], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[14:15], v[54:55], v[48:51] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[36:39], m0 offset:16384 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:18432 element:1 row:3 col:1 ds_read_matrix_trans_format v[52:55], m0 offset:20480 element:1 row:3 col:1 ds_read_matrix_trans_format v[56:59], m0 offset:22528 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[16:17], v[36:37], v[48:51] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[16:17], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[18:19], v[42:43], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[18:19], v[38:39], v[48:51] lit s_waitcnt lgkmcnt(0) 
v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[20:21], v[56:57], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[20:21], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[22:23], v[58:59], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[22:23], v[54:55], v[48:51] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[36:39], m0 offset:24576 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:26624 element:1 row:3 col:1 ds_read_matrix_trans_format v[52:55], m0 offset:28672 element:1 row:3 col:1 ds_read_matrix_trans_format v[56:59], m0 offset:30720 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[24:25], v[36:37], v[48:51] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[24:25], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[26:27], v[42:43], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[26:27], v[38:39], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[28:29], v[56:57], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[28:29], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[30:31], v[58:59], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[30:31], v[54:55], v[48:51] lit ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND s_ashr_i32 s44, s31, 31 s_add_i32 s45, s44, s31 s_xor_b32 s45, s45, s44 v_cvt_f32_u32_e32 v36, s45 v_ashrrev_i32_e32 v54, 31, v133 v_add_u32_e32 v57, v54, v133 v_xor_b32_e32 v56, s44, v54 v_rcp_iflag_f32_e32 v36, v36 v_xor_b32_e32 v54, v57, v54 s_not_b32 s49, s74 s_add_i32 s49, s49, s80 v_mul_f32_e32 v36, 0x4f7ffffe, v36 v_cvt_u32_f32_e32 v52, v36 ds_read_matrix_trans_format v[36:39], m0 offset:32768 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:34816 element:1 row:3 col:1 v_mov_b32_e32 v55, 0xff800000 v_mul_lo_u32 v53, s45, v52 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[32:33], v[36:37], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 
v[48:51], v[34:35], v[38:39], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[32:33], v[40:41], v[44:47] lit v_sub_u32_e32 v53, 0, v53 v_mul_hi_u32 v53, v53, v52 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[34:35], v[42:43], v[44:47] lit v_add_u32_e32 v52, v53, v52 v_mul_hi_u32 v52, v52, v54 v_mul_lo_u32 v36, v52, s45 v_add_u32_e32 v37, 1, v52 v_sub_u32_e32 v36, v54, v36 v_cmp_le_u32_e32 vcc, s45, v36 v_subrev_u32_e32 v38, s45, v36 v_cndmask_b32_e32 v37, v52, v37, vcc v_cndmask_b32_e32 v36, v36, v38, vcc v_add_u32_e32 v38, 1, v37 v_cmp_le_u32_e32 vcc, s45, v36 v_cndmask_b32_e32 v36, v37, v38, vcc v_xor_b32_e32 v36, v36, v56 v_sub_u32_e32 v104, v36, v56 v_sub_u32_e32 v36, s49, v104 v_cmp_le_i32_e32 vcc, v124, v36 v_cndmask_b32_e32 v43, v55, v48, vcc v_cmp_le_i32_e32 vcc, v134, v36 v_cndmask_b32_e32 v42, v55, v49, vcc v_cmp_le_i32_e32 vcc, v135, v36 v_cndmask_b32_e32 v41, v55, v50, vcc v_cmp_le_i32_e32 vcc, v136, v36 v_cndmask_b32_e32 v40, v55, v51, vcc v_cmp_le_i32_e32 vcc, v137, v36 v_cndmask_b32_e32 v39, v55, v44, vcc v_cmp_le_i32_e32 vcc, v138, v36 v_cndmask_b32_e32 v38, v55, v45, vcc v_cmp_le_i32_e32 vcc, v139, v36 v_cndmask_b32_e32 v37, v55, v46, vcc v_cmp_le_i32_e32 vcc, v140, v36 v_cndmask_b32_e32 v36, v55, v47, vcc v_and_b32_e32 v45, 63, v156 v_and_b32_e32 v47, 64, v156 v_max_f32_e32 v44, v43, v42 v_xor_b32_e32 v46, 32, v45 v_add_u32_e32 v47, 64, v47 v_max3_f32 v44, v44, v41, v40 v_cmp_lt_i32_e32 vcc, v46, v47 v_max3_f32 v44, v44, v39, v38 v_cndmask_b32_e32 v46, v156, v46, vcc v_max3_f32 v44, v44, v37, v36 v_lshlrev_b32_e32 v102, 2, v46 ds_bpermute_b32 v46, v102, v44 v_xor_b32_e32 v45, 16, v45 v_cmp_lt_i32_e32 vcc, v45, v47 v_cndmask_b32_e32 v45, v156, v45, vcc v_lshlrev_b32_e32 v103, 2, v45 s_waitcnt lgkmcnt(0) v_max_f32_e32 v44, v44, v46 ds_bpermute_b32 v45, v103, v44 s_and_saveexec_b64 s[44:45], s[2:3] s_cbranch_execz .LBB0_24 ; %bb.23: ; in Loop: Header=BB0_5 Depth=1 s_waitcnt lgkmcnt(0) v_max_f32_e32 v44, v44, v45 ds_write_b32 v115, 
v44 .LBB0_24: ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[44:45] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[44:45], s[4:5] s_cbranch_execz .LBB0_26 ; %bb.25: ; in Loop: Header=BB0_5 Depth=1 ds_read_b64 v[44:45], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v44, v44, v45 ds_write_b32 v125, v44 offset:512 .LBB0_26: ; %_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp1ILb1ELb1ELb1EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi256EEEEEENS8_IJSB_EEEEEEEEEvRT2_RT3_fPDv4_f.exit.peel.i ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[44:45] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v111, v125 offset:512 s_waitcnt lgkmcnt(0) v_mul_f32_e32 v44, v111, v114 v_cmp_lg_f32_e32 vcc, s78, v111 v_cndmask_b32_e32 v44, 0, v44, vcc v_fma_f32 v43, v43, v114, -v44 v_fma_f32 v42, v42, v114, -v44 v_exp_f32_e32 v43, v43 v_exp_f32_e32 v42, v42 v_fma_f32 v41, v41, v114, -v44 v_fma_f32 v40, v40, v114, -v44 v_exp_f32_e32 v100, v41 v_exp_f32_e32 v40, v40 v_fma_f32 v39, v39, v114, -v44 v_fma_f32 v38, v38, v114, -v44 v_exp_f32_e32 v39, v39 v_fma_f32 v37, v37, v114, -v44 v_exp_f32_e32 v38, v38 v_exp_f32_e32 v101, v37 v_add_f32_e32 v37, v42, v43 v_fma_f32 v36, v36, v114, -v44 v_add_f32_e32 v37, v37, v100 v_exp_f32_e32 v36, v36 v_add_f32_e32 v37, v37, v40 v_add_f32_e32 v37, v37, v39 v_add_f32_e32 v37, v37, v38 v_add_f32_e32 v37, v37, v101 v_add_f32_e32 v157, v37, v36 ; sched_barrier mask(0x00000000) v_cvt_pk_fp8_f32 v43, v43, v42, s0 v_cvt_pk_fp8_f32 v39, v39, v38, s0 v_cvt_pk_fp8_f32 v100, v100, v40, v43 op_sel:[0,0,0,1] v_cvt_pk_fp8_f32 v101, v101, v36, v39 op_sel:[0,0,0,1] v_add_u32_e32 v36, 0, v127 v_add_u32_e32 v36, 0xa000, v36 ds_write2_b32 v36, v100, v101 offset1:2 v_add_u32_e32 v36, 0, v126 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[158:161], v36 
offset:40960 ; sched_barrier mask(0x00000000) ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[54:56:58:60], v128 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[55:57:59:61], v117 ds_read_m64x16_b8_alt4 v[62:64:66:68], v118 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[63:65:67:69], v119 ds_read_m64x16_b8_alt4 v[70:72:74:76], v141 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[71:73:75:77], v142 ds_read_m64x16_b8_alt4 v[78:80:82:84], v143 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[79:81:83:85], v144 ds_read_m64x16_b8_alt4 v[86:88:90:92], v145 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[87:89:91:93], v146 ds_read_m64x16_b8_alt4 v[162:164:166:168], v147 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[163:165:167:169], v148 ds_read_m64x16_b8_alt4 v[170:172:174:176], v149 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[171:173:175:177], v152 ds_read_m64x16_b8_alt4 v[178:180:182:184], v153 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[179:181:183:185], v154 ; sched_barrier mask(0x00000000) s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[36:37], s[48:49] v_mov_b64_e32 v[38:39], s[50:51] v_mov_b32_e32 v40, v54 v_mov_b32_e32 v41, v55 v_mov_b64_e32 v[54:55], v[38:39] v_mov_b64_e32 v[52:53], v[36:37] v_mov_b64_e32 v[50:51], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[158:159], v[40:41], v[52:55] lit v_mov_b32_e32 v40, v56 v_mov_b32_e32 v41, v57 v_mov_b64_e32 v[48:49], v[36:37] v_mov_b64_e32 v[46:47], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[158:159], v[40:41], v[48:51] lit v_mov_b32_e32 v40, v58 v_mov_b32_e32 v41, v59 v_mov_b64_e32 v[44:45], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[158:159], v[40:41], v[44:47] lit v_mov_b64_e32 v[42:43], v[38:39] v_mov_b32_e32 v56, v60 v_mov_b32_e32 v57, v61 v_mov_b64_e32 v[40:41], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[158:159], v[56:57], v[40:43] lit v_mov_b32_e32 v56, v62 v_mov_b32_e32 v57, v63 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], 
v[160:161], v[56:57], v[52:55] lit v_mov_b32_e32 v56, v64 v_mov_b32_e32 v57, v65 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[160:161], v[56:57], v[48:51] lit v_mov_b32_e32 v56, v66 v_mov_b32_e32 v57, v67 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[160:161], v[56:57], v[44:47] lit v_mov_b32_e32 v56, v68 v_mov_b32_e32 v57, v69 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[160:161], v[56:57], v[40:43] lit v_mov_b32_e32 v56, v70 v_mov_b32_e32 v57, v71 v_mov_b64_e32 v[70:71], v[38:39] v_mov_b64_e32 v[68:69], v[36:37] v_mov_b64_e32 v[66:67], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[158:159], v[56:57], v[68:71] lit v_mov_b32_e32 v56, v72 v_mov_b32_e32 v57, v73 v_mov_b64_e32 v[64:65], v[36:37] v_mov_b64_e32 v[62:63], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[158:159], v[56:57], v[64:67] lit v_mov_b32_e32 v56, v74 v_mov_b32_e32 v57, v75 v_mov_b64_e32 v[60:61], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[158:159], v[56:57], v[60:63] lit v_mov_b64_e32 v[58:59], v[38:39] v_mov_b32_e32 v72, v76 v_mov_b32_e32 v73, v77 v_mov_b64_e32 v[56:57], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[158:159], v[72:73], v[56:59] lit v_mov_b32_e32 v72, v78 v_mov_b32_e32 v73, v79 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[160:161], v[72:73], v[68:71] lit v_mov_b32_e32 v72, v80 v_mov_b32_e32 v73, v81 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[160:161], v[72:73], v[64:67] lit v_mov_b32_e32 v72, v82 v_mov_b32_e32 v73, v83 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[160:161], v[72:73], v[60:63] lit v_mov_b32_e32 v72, v84 v_mov_b32_e32 v73, v85 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[160:161], v[72:73], v[56:59] lit v_mov_b32_e32 v72, v86 v_mov_b32_e32 v73, v87 v_mov_b64_e32 v[86:87], v[38:39] v_mov_b64_e32 v[84:85], v[36:37] v_mov_b64_e32 v[82:83], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[158:159], v[72:73], v[84:87] lit v_mov_b32_e32 v72, v88 v_mov_b32_e32 v73, v89 v_mov_b64_e32 v[80:81], 
v[36:37] v_mov_b64_e32 v[78:79], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[158:159], v[72:73], v[80:83] lit v_mov_b32_e32 v72, v90 v_mov_b32_e32 v73, v91 v_mov_b64_e32 v[76:77], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[158:159], v[72:73], v[76:79] lit v_mov_b64_e32 v[74:75], v[38:39] v_mov_b32_e32 v88, v92 v_mov_b32_e32 v89, v93 v_mov_b64_e32 v[72:73], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[158:159], v[88:89], v[72:75] lit v_mov_b32_e32 v88, v162 v_mov_b32_e32 v89, v163 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[160:161], v[88:89], v[84:87] lit v_mov_b32_e32 v88, v164 v_mov_b32_e32 v89, v165 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[160:161], v[88:89], v[80:83] lit v_mov_b32_e32 v88, v166 v_mov_b32_e32 v89, v167 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[160:161], v[88:89], v[76:79] lit v_mov_b32_e32 v88, v168 v_mov_b32_e32 v89, v169 v_mov_b64_e32 v[98:99], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[160:161], v[88:89], v[72:75] lit v_mov_b32_e32 v88, v170 v_mov_b32_e32 v89, v171 v_mov_b64_e32 v[96:97], v[36:37] v_mov_b64_e32 v[94:95], v[38:39] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[158:159], v[88:89], v[96:99] lit v_mov_b32_e32 v88, v172 v_mov_b32_e32 v89, v173 v_mov_b64_e32 v[92:93], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[158:159], v[88:89], v[92:95] lit v_mov_b64_e32 v[90:91], v[38:39] v_mov_b32_e32 v105, v174 v_mov_b32_e32 v106, v175 v_mov_b64_e32 v[88:89], v[36:37] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[158:159], v[105:106], v[88:91] lit v_mov_b32_e32 v105, v176 v_mov_b32_e32 v106, v177 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[36:39], v[158:159], v[105:106], v[36:39] lit v_mov_b32_e32 v105, v178 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v106, v179 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[160:161], v[105:106], v[96:99] lit v_mov_b32_e32 v105, v180 v_mov_b32_e32 v106, v181 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[160:161], v[105:106], v[92:95] 
lit v_mov_b32_e32 v105, v182 v_mov_b32_e32 v106, v183 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[160:161], v[105:106], v[88:91] lit v_mov_b32_e32 v105, v184 v_mov_b32_e32 v106, v185 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[36:39], v[160:161], v[105:106], v[36:39] lit ;;#ASMSTART s_barrier ;;#ASMEND s_add_i32 s44, s11, -2 s_cmp_le_i32 s40, s81 s_cbranch_scc1 .LBB0_36 ; %bb.27: ; %.peel.next.i ; in Loop: Header=BB0_5 Depth=1 s_ashr_i32 s45, s44, 31 s_lshl_b64 s[50:51], s[44:45], 2 s_add_u32 s50, s88, s50 s_addc_u32 s51, s87, s51 s_lshl_b32 s49, s44, 6 s_add_i32 s70, s70, s49 s_max_i32 s40, s70, 0 s_lshl_b32 s40, s40, 8 s_or_b32 s55, s40, 0x10000 ;;#ASMSTART s_load_dword s40, s[50:51], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s44, s40, 31 s_mul_i32 s45, s40, s59 s_mul_hi_u32 s50, s40, s58 s_add_i32 s45, s50, s45 s_mul_i32 s44, s44, s58 s_add_i32 s45, s45, s44 s_mul_i32 s40, s40, s58 s_add_u32 s44, s71, s40 s_addc_u32 s45, s72, s45 s_add_u32 s52, s44, s41 s_addc_u32 s53, s45, s91 s_mov_b32 s54, s42 s_nop 0 matrix_load_64x16_b8 s[52:55] s82 t r lds matrix_load_64x16_b8 s[52:55] s92 moffset:128 t r lds matrix_load_64x16_b8 s[52:55] s93 moffset:256 t r lds matrix_load_64x16_b8 s[52:55] s94 moffset:384 t r lds s_mov_b64 s[40:41], -1 s_and_b64 vcc, exec, s[26:27] s_cbranch_vccz .LBB0_29 ; %bb.28: ; in Loop: Header=BB0_5 Depth=1 v_readfirstlane_b32 s26, v123 s_ashr_i32 s27, s26, 31 s_lshr_b32 s27, s27, 30 s_add_i32 s27, s26, s27 s_and_b32 s40, s27, 0x3ffffc s_lshl_b32 s27, s27, 10 s_sub_i32 s26, s26, s40 s_and_b32 s27, s27, 0xfffff000 s_lshl_b32 s26, s26, 10 s_add_i32 s27, s27, 0 s_add_i32 s26, s27, s26 s_add_i32 s26, s26, 0x8000 v_mov_b32_e32 v105, -1 ;;#ASMSTART s_mov_b32 m0, s26 buffer_load_dwordx4 v105, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[40:41], 0 .LBB0_29: ; %Flow303 ; in Loop: Header=BB0_5 Depth=1 s_andn2_b64 vcc, exec, s[40:41] v_sub_u32_e32 v104, 0, v104 s_cbranch_vccnz .LBB0_31 ; %bb.30: ; in Loop: Header=BB0_5 Depth=1 
matrix_load_64x16_b8 s[52:55] s90 moffset:512 t r lds .LBB0_31: ; in Loop: Header=BB0_5 Depth=1 s_not_b32 s26, s49 ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND s_mov_b32 m0, s95 s_nop 0 ds_read_matrix_trans_format v[158:161], m0 element:1 row:3 col:1 s_mov_b32 s50, s48 s_mov_b32 s51, s48 s_mov_b32 s49, s48 v_mov_b64_e32 v[164:165], s[50:51] v_mov_b64_e32 v[162:163], s[48:49] v_mov_b64_e32 v[168:169], v[164:165] v_mov_b64_e32 v[166:167], v[162:163] ds_read_matrix_trans_format v[170:173], m0 offset:2048 element:1 row:3 col:1 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[0:1], v[158:159], v[166:169] lit ds_read_matrix_trans_format v[174:177], m0 offset:4096 element:1 row:3 col:1 v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[2:3], v[160:161], v[166:169] lit ds_read_matrix_trans_format v[158:161], m0 offset:6144 element:1 row:3 col:1 s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[0:1], v[170:171], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[2:3], v[172:173], v[162:165] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[4:5], v[174:175], v[166:169] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[6:7], v[176:177], v[166:169] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[4:5], v[158:159], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[6:7], v[160:161], v[162:165] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[158:161], m0 offset:8192 element:1 row:3 col:1 ds_read_matrix_trans_format v[170:173], m0 offset:10240 element:1 row:3 col:1 ds_read_matrix_trans_format v[174:177], m0 offset:12288 element:1 row:3 col:1 ds_read_matrix_trans_format v[178:181], m0 offset:14336 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[8:9], v[158:159], v[166:169] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[8:9], v[170:171], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[10:11], v[160:161], v[166:169] 
lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[10:11], v[172:173], v[162:165] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[12:13], v[174:175], v[166:169] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[12:13], v[178:179], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[14:15], v[176:177], v[166:169] lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[14:15], v[180:181], v[162:165] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[158:161], m0 offset:16384 element:1 row:3 col:1 ds_read_matrix_trans_format v[170:173], m0 offset:18432 element:1 row:3 col:1 ds_read_matrix_trans_format v[174:177], m0 offset:20480 element:1 row:3 col:1 ds_read_matrix_trans_format v[178:181], m0 offset:22528 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[16:17], v[158:159], v[166:169] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[16:17], v[170:171], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[18:19], v[160:161], v[166:169] lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[18:19], v[172:173], v[162:165] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[20:21], v[174:175], v[166:169] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[20:21], v[178:179], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[22:23], v[176:177], v[166:169] lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[22:23], v[180:181], v[162:165] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[158:161], m0 offset:24576 element:1 row:3 col:1 ds_read_matrix_trans_format v[170:173], m0 offset:26624 element:1 row:3 col:1 ds_read_matrix_trans_format v[174:177], m0 offset:28672 element:1 row:3 col:1 ds_read_matrix_trans_format v[178:181], m0 offset:30720 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[24:25], v[158:159], v[166:169] lit s_waitcnt lgkmcnt(2) 
v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[24:25], v[170:171], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[26:27], v[160:161], v[166:169] lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[26:27], v[172:173], v[162:165] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[28:29], v[174:175], v[166:169] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[28:29], v[178:179], v[162:165] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[30:31], v[176:177], v[166:169] lit v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[30:31], v[180:181], v[162:165] lit ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[158:161], m0 offset:32768 element:1 row:3 col:1 ds_read_matrix_trans_format v[170:173], m0 offset:34816 element:1 row:3 col:1 s_add_i32 s26, s26, s80 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[32:33], v[158:159], v[166:169] lit v_mmac_f32_16x16x32_fp8_fp8 v[166:169], v[34:35], v[160:161], v[166:169] lit v_add_u32_e32 v160, s26, v104 v_mov_b32_e32 v161, 0xff800000 v_cmp_le_i32_e32 vcc, v124, v160 v_cndmask_b32_e32 v104, v161, v166, vcc v_cmp_le_i32_e32 vcc, v134, v160 v_cndmask_b32_e32 v105, v161, v167, vcc v_cmp_le_i32_e32 vcc, v135, v160 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[32:33], v[170:171], v[162:165] lit v_cndmask_b32_e32 v106, v161, v168, vcc v_cmp_le_i32_e32 vcc, v136, v160 v_mmac_f32_16x16x32_fp8_fp8 v[162:165], v[34:35], v[172:173], v[162:165] lit v_cndmask_b32_e32 v107, v161, v169, vcc v_cmp_le_i32_e32 vcc, v137, v160 s_nop 0 v_cndmask_b32_e32 v112, v161, v162, vcc v_cmp_le_i32_e32 vcc, v138, v160 v_cndmask_b32_e32 v158, v161, v163, vcc v_cmp_le_i32_e32 vcc, v139, v160 v_cndmask_b32_e32 v159, v161, v164, vcc v_cmp_le_i32_e32 vcc, v140, v160 v_cndmask_b32_e32 v160, v161, v165, vcc v_max3_f32 v161, v111, v104, v105 v_max3_f32 v161, v161, v106, v107 v_max3_f32 v161, v161, v112, v158 v_max3_f32 v161, v161, v159, v160 ds_bpermute_b32 v102, v102, v161 s_waitcnt 
lgkmcnt(0) v_max_f32_e32 v102, v161, v102 ds_bpermute_b32 v103, v103, v102 s_and_saveexec_b64 s[26:27], s[2:3] s_cbranch_execz .LBB0_33 ; %bb.32: ; in Loop: Header=BB0_5 Depth=1 s_waitcnt lgkmcnt(0) v_max_f32_e32 v102, v102, v103 ds_write_b32 v115, v102 .LBB0_33: ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[26:27] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[26:27], s[4:5] s_cbranch_execz .LBB0_35 ; %bb.34: ; in Loop: Header=BB0_5 Depth=1 ds_read_b64 v[102:103], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v102, v102, v103 ds_write_b32 v125, v102 offset:512 .LBB0_35: ; %_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp1ILb0ELb1ELb1EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi256EEEEEENS8_IJSB_EEEEEEEEEvRT2_RT3_fPDv4_f.exit.i ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[26:27] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v102, v125 offset:512 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, s78, v102 v_cndmask_b32_e64 v103, v102, 0, vcc v_sub_f32_e32 v103, v111, v103 v_mul_f32_e32 v103, v103, v114 s_nop 0 v_exp_f32_e32 v161, v103 v_mul_f32_e32 v103, v102, v114 v_cndmask_b32_e64 v103, v103, 0, vcc v_fma_f32 v104, v104, v114, -v103 v_fma_f32 v105, v105, v114, -v103 v_exp_f32_e32 v104, v104 v_exp_f32_e32 v105, v105 v_fma_f32 v106, v106, v114, -v103 v_fma_f32 v107, v107, v114, -v103 v_exp_f32_e32 v106, v106 v_exp_f32_e32 v107, v107 v_fma_f32 v111, v112, v114, -v103 v_fma_f32 v112, v158, v114, -v103 v_exp_f32_e32 v111, v111 v_exp_f32_e32 v112, v112 v_fma_f32 v158, v159, v114, -v103 v_add_f32_e32 v159, v105, v104 v_exp_f32_e32 v158, v158 v_fma_f32 v103, v160, v114, -v103 v_add_f32_e32 v159, v159, v106 v_exp_f32_e32 v103, v103 v_add_f32_e32 v159, v159, v107 v_add_f32_e32 v159, v159, v111 v_add_f32_e32 v159, v159, v112 v_mov_b32_e32 v162, v161 v_add_f32_e32 
v159, v159, v158 v_pk_mul_f32 v[52:53], v[161:162], v[52:53] v_pk_mul_f32 v[54:55], v[161:162], v[54:55] v_pk_mul_f32 v[48:49], v[161:162], v[48:49] v_pk_mul_f32 v[50:51], v[161:162], v[50:51] v_pk_mul_f32 v[44:45], v[161:162], v[44:45] v_pk_mul_f32 v[46:47], v[161:162], v[46:47] v_pk_mul_f32 v[40:41], v[161:162], v[40:41] v_pk_mul_f32 v[42:43], v[161:162], v[42:43] v_pk_mul_f32 v[68:69], v[161:162], v[68:69] v_pk_mul_f32 v[70:71], v[161:162], v[70:71] v_pk_mul_f32 v[64:65], v[161:162], v[64:65] v_pk_mul_f32 v[66:67], v[161:162], v[66:67] v_pk_mul_f32 v[60:61], v[161:162], v[60:61] v_pk_mul_f32 v[62:63], v[161:162], v[62:63] v_pk_mul_f32 v[56:57], v[161:162], v[56:57] v_pk_mul_f32 v[58:59], v[161:162], v[58:59] v_pk_mul_f32 v[84:85], v[161:162], v[84:85] v_pk_mul_f32 v[86:87], v[161:162], v[86:87] v_pk_mul_f32 v[80:81], v[161:162], v[80:81] v_pk_mul_f32 v[82:83], v[161:162], v[82:83] v_pk_mul_f32 v[76:77], v[161:162], v[76:77] v_pk_mul_f32 v[78:79], v[161:162], v[78:79] v_pk_mul_f32 v[72:73], v[161:162], v[72:73] v_pk_mul_f32 v[74:75], v[161:162], v[74:75] v_pk_mul_f32 v[96:97], v[161:162], v[96:97] v_pk_mul_f32 v[98:99], v[161:162], v[98:99] v_pk_mul_f32 v[92:93], v[161:162], v[92:93] v_pk_mul_f32 v[94:95], v[161:162], v[94:95] v_pk_mul_f32 v[88:89], v[161:162], v[88:89] v_pk_mul_f32 v[90:91], v[161:162], v[90:91] v_pk_mul_f32 v[36:37], v[161:162], v[36:37] v_pk_mul_f32 v[38:39], v[161:162], v[38:39] v_add_f32_e32 v221, v159, v103 v_fmac_f32_e32 v221, v161, v157 ; sched_barrier mask(0x00000000) v_cvt_pk_fp8_f32 v104, v104, v105, v100 v_cvt_pk_fp8_f32 v111, v111, v112, v101 v_add_u32_e32 v100, 0, v127 v_add_u32_e32 v100, 0xa000, v100 v_cvt_pk_fp8_f32 v106, v106, v107, v104 op_sel:[0,0,0,1] v_cvt_pk_fp8_f32 v158, v158, v103, v111 op_sel:[0,0,0,1] ds_write2_b32 v100, v106, v158 offset1:2 v_add_u32_e32 v100, 0, v126 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[103:106], v100 offset:40960 ; sched_barrier mask(0x00000000) ; sched_barrier 
mask(0x00000000) ds_read_m64x16_b8_alt4 v[157:159:161:163], v128 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[158:160:162:164], v117 ds_read_m64x16_b8_alt4 v[165:167:169:171], v118 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[166:168:170:172], v119 ds_read_m64x16_b8_alt4 v[173:175:177:179], v141 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[174:176:178:180], v142 ds_read_m64x16_b8_alt4 v[181:183:185:187], v143 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[182:184:186:188], v144 ds_read_m64x16_b8_alt4 v[189:191:193:195], v145 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[190:192:194:196], v146 ds_read_m64x16_b8_alt4 v[197:199:201:203], v147 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[198:200:202:204], v148 ds_read_m64x16_b8_alt4 v[205:207:209:211], v149 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[206:208:210:212], v152 ds_read_m64x16_b8_alt4 v[213:215:217:219], v153 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[214:216:218:220], v154 ; sched_barrier mask(0x00000000) v_mov_b32_e32 v100, v157 v_mov_b32_e32 v101, v158 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[103:104], v[100:101], v[52:55] lit v_mov_b32_e32 v100, v159 v_mov_b32_e32 v101, v160 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[103:104], v[100:101], v[48:51] lit v_mov_b32_e32 v100, v161 v_mov_b32_e32 v101, v162 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[103:104], v[100:101], v[44:47] lit v_mov_b32_e32 v100, v163 v_mov_b32_e32 v101, v164 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[103:104], v[100:101], v[40:43] lit v_mov_b32_e32 v100, v165 v_mov_b32_e32 v101, v166 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[105:106], v[100:101], v[52:55] lit v_mov_b32_e32 v100, v167 v_mov_b32_e32 v101, v168 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[105:106], v[100:101], v[48:51] lit v_mov_b32_e32 v100, v169 v_mov_b32_e32 v101, v170 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[105:106], v[100:101], v[44:47] lit v_mov_b32_e32 v100, v171 v_mov_b32_e32 v101, v172 s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[105:106], v[100:101], v[40:43] lit v_mov_b32_e32 v100, v173 v_mov_b32_e32 v101, v174 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[103:104], v[100:101], v[68:71] lit v_mov_b32_e32 v100, v175 v_mov_b32_e32 v101, v176 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[103:104], v[100:101], v[64:67] lit v_mov_b32_e32 v100, v177 v_mov_b32_e32 v101, v178 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[103:104], v[100:101], v[60:63] lit v_mov_b32_e32 v100, v179 v_mov_b32_e32 v101, v180 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[103:104], v[100:101], v[56:59] lit v_mov_b32_e32 v100, v181 v_mov_b32_e32 v101, v182 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[105:106], v[100:101], v[68:71] lit v_mov_b32_e32 v100, v183 v_mov_b32_e32 v101, v184 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[105:106], v[100:101], v[64:67] lit v_mov_b32_e32 v100, v185 v_mov_b32_e32 v101, v186 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[105:106], v[100:101], v[60:63] lit v_mov_b32_e32 v100, v187 v_mov_b32_e32 v101, v188 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[105:106], v[100:101], v[56:59] lit v_mov_b32_e32 v100, v189 v_mov_b32_e32 v101, v190 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[103:104], v[100:101], v[84:87] lit v_mov_b32_e32 v100, v191 v_mov_b32_e32 v101, v192 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[103:104], v[100:101], v[80:83] lit v_mov_b32_e32 v100, v193 v_mov_b32_e32 v101, v194 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[103:104], v[100:101], v[76:79] lit v_mov_b32_e32 v100, v195 v_mov_b32_e32 v101, v196 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[103:104], v[100:101], v[72:75] lit v_mov_b32_e32 v100, v197 v_mov_b32_e32 v101, v198 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[105:106], v[100:101], v[84:87] lit v_mov_b32_e32 v100, v199 v_mov_b32_e32 v101, v200 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[105:106], v[100:101], v[80:83] lit v_mov_b32_e32 v100, v201 v_mov_b32_e32 v101, v202 s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[105:106], v[100:101], v[76:79] lit v_mov_b32_e32 v100, v203 v_mov_b32_e32 v101, v204 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[105:106], v[100:101], v[72:75] lit v_mov_b32_e32 v100, v205 v_mov_b32_e32 v101, v206 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[103:104], v[100:101], v[96:99] lit v_mov_b32_e32 v100, v207 v_mov_b32_e32 v101, v208 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[103:104], v[100:101], v[92:95] lit v_mov_b32_e32 v100, v209 v_mov_b32_e32 v101, v210 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[103:104], v[100:101], v[88:91] lit v_mov_b32_e32 v100, v211 v_mov_b32_e32 v101, v212 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[36:39], v[103:104], v[100:101], v[36:39] lit v_mov_b32_e32 v100, v213 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v101, v214 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[105:106], v[100:101], v[96:99] lit v_mov_b32_e32 v100, v215 v_mov_b32_e32 v101, v216 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[105:106], v[100:101], v[92:95] lit v_mov_b32_e32 v100, v217 v_mov_b32_e32 v101, v218 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[105:106], v[100:101], v[88:91] lit v_mov_b32_e32 v100, v219 v_mov_b32_e32 v101, v220 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[36:39], v[105:106], v[100:101], v[36:39] lit ;;#ASMSTART s_barrier ;;#ASMEND s_add_i32 s44, s11, -3 v_mov_b32_e32 v157, v221 v_mov_b32_e32 v111, v102 .LBB0_36: ; %Flow304 ; in Loop: Header=BB0_5 Depth=1 s_mov_b32 s40, s44 s_cmp_lt_i32 s40, s81 s_cbranch_scc0 .LBB0_41 .LBB0_37: ; in Loop: Header=BB0_5 Depth=1 v_mov_b32_e32 v100, v157 v_mov_b32_e32 v158, v111 .LBB0_38: ; %._crit_edge.i ; in Loop: Header=BB0_5 Depth=1 s_cmp_eq_u32 s81, 0 s_cselect_b64 s[26:27], -1, 0 s_add_i32 s40, s80, 63 s_ashr_i32 s41, s40, 31 s_lshr_b32 s41, s41, 26 s_add_i32 s40, s40, s41 s_ashr_i32 s40, s40, 6 s_cmp_eq_u32 s11, s40 s_cselect_b64 s[40:41], -1, 0 s_and_b64 s[26:27], s[26:27], s[40:41] s_andn2_b64 vcc, exec, s[26:27] s_mov_b64 s[26:27], -1 s_cbranch_vccnz 
.LBB0_52 ; %bb.39: ; %Flow296 ; in Loop: Header=BB0_5 Depth=1 s_and_b64 vcc, exec, s[26:27] s_cbranch_vccz .LBB0_4 s_branch .LBB0_61 .LBB0_40: ; in Loop: Header=BB0_5 Depth=1 s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[36:37], s[48:49] v_mov_b64_e32 v[38:39], s[50:51] v_mov_b64_e32 v[90:91], v[38:39] v_mov_b64_e32 v[94:95], v[38:39] v_mov_b64_e32 v[98:99], v[38:39] v_mov_b64_e32 v[74:75], v[38:39] v_mov_b64_e32 v[78:79], v[38:39] v_mov_b64_e32 v[82:83], v[38:39] v_mov_b64_e32 v[86:87], v[38:39] v_mov_b64_e32 v[58:59], v[38:39] v_mov_b64_e32 v[62:63], v[38:39] v_mov_b64_e32 v[66:67], v[38:39] v_mov_b64_e32 v[70:71], v[38:39] v_mov_b64_e32 v[42:43], v[38:39] v_mov_b64_e32 v[46:47], v[38:39] v_mov_b64_e32 v[50:51], v[38:39] v_mov_b64_e32 v[54:55], v[38:39] v_mov_b32_e32 v111, 0 v_mov_b64_e32 v[88:89], v[36:37] v_mov_b64_e32 v[92:93], v[36:37] v_mov_b64_e32 v[96:97], v[36:37] v_mov_b64_e32 v[72:73], v[36:37] v_mov_b64_e32 v[76:77], v[36:37] v_mov_b64_e32 v[80:81], v[36:37] v_mov_b64_e32 v[84:85], v[36:37] v_mov_b64_e32 v[56:57], v[36:37] v_mov_b64_e32 v[60:61], v[36:37] v_mov_b64_e32 v[64:65], v[36:37] v_mov_b64_e32 v[68:69], v[36:37] v_mov_b64_e32 v[40:41], v[36:37] v_mov_b64_e32 v[44:45], v[36:37] v_mov_b64_e32 v[48:49], v[36:37] v_mov_b64_e32 v[52:53], v[36:37] s_cmp_lt_i32 s40, s81 s_cbranch_scc1 .LBB0_37 .LBB0_41: ; %.lr.ph722.i ; in Loop: Header=BB0_5 Depth=1 s_add_i32 s54, s82, 0x2000 s_add_i32 s55, s82, 0x4000 s_add_i32 s90, s82, 0x6000 s_cmp_gt_i32 s86, 3 s_cselect_b64 s[26:27], -1, 0 s_lshl_b32 s41, s89, 10 s_add_i32 s89, s41, 0 s_mul_i32 s41, s84, s61 s_mul_hi_u32 s44, s84, s60 s_add_i32 s41, s44, s41 s_mul_i32 s85, s85, s60 s_add_i32 s86, s82, 0x8000 s_add_i32 s41, s41, s85 s_mul_i32 s84, s84, s60 s_add_u32 s84, s84, s79 s_addc_u32 s83, s41, s83 s_ashr_i32 s41, s40, 31 s_add_i32 s85, s40, 1 s_lshl_b64 s[40:41], s[40:41], 2 s_add_u32 s52, s88, s40 s_addc_u32 s53, s87, s41 ; implicit-def: $vgpr160 ; implicit-def: $vgpr159 
.LBB0_42: ; Parent Loop BB0_5 Depth=1 ; => This Inner Loop Header: Depth=2 s_mov_b64 s[40:41], s[52:53] ;;#ASMSTART s_load_dword s44, s[40:41], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s40, s44, 31 s_mul_i32 s41, s44, s59 s_mul_hi_u32 s45, s44, s58 s_add_i32 s41, s45, s41 s_mul_i32 s40, s40, s58 s_add_i32 s41, s41, s40 s_mul_i32 s44, s44, s58 s_add_u32 s44, s71, s44 s_addc_u32 s45, s72, s41 s_add_u32 s40, s44, s84 s_addc_u32 s41, s45, s83 s_nop 0 matrix_load_64x16_b8 s[40:43] s82 t r lds matrix_load_64x16_b8 s[40:43] s54 moffset:128 t r lds matrix_load_64x16_b8 s[40:43] s55 moffset:256 t r lds matrix_load_64x16_b8 s[40:43] s90 moffset:384 t r lds s_mov_b64 s[50:51], -1 s_and_b64 vcc, exec, s[26:27] s_cbranch_vccz .LBB0_44 ; %bb.43: ; in Loop: Header=BB0_42 Depth=2 v_readfirstlane_b32 s49, v123 s_ashr_i32 s50, s49, 31 s_lshr_b32 s50, s50, 30 s_add_i32 s50, s49, s50 s_and_b32 s51, s50, 0x3ffffc s_lshl_b32 s50, s50, 10 s_sub_i32 s49, s49, s51 s_and_b32 s50, s50, 0xfffff000 s_lshl_b32 s49, s49, 10 s_add_i32 s50, s50, 0 s_add_i32 s49, s50, s49 s_add_i32 s49, s49, 0x8000 v_mov_b32_e32 v100, -1 ;;#ASMSTART s_mov_b32 m0, s49 buffer_load_dwordx4 v100, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[50:51], 0 .LBB0_44: ; %Flow299 ; in Loop: Header=BB0_42 Depth=2 s_andn2_b64 vcc, exec, s[50:51] s_cbranch_vccnz .LBB0_46 ; %bb.45: ; in Loop: Header=BB0_42 Depth=2 matrix_load_64x16_b8 s[40:43] s86 moffset:512 t r lds .LBB0_46: ; in Loop: Header=BB0_42 Depth=2 ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND s_mov_b32 m0, s89 s_nop 0 ds_read_matrix_trans_format v[161:164], m0 element:1 row:3 col:1 s_mov_b32 s50, s48 s_mov_b32 s51, s48 s_mov_b32 s49, s48 v_mov_b64_e32 v[102:103], s[50:51] v_mov_b64_e32 v[100:101], s[48:49] v_mov_b64_e32 v[106:107], v[102:103] v_mov_b64_e32 v[104:105], v[100:101] ds_read_matrix_trans_format v[165:168], m0 offset:2048 element:1 row:3 col:1 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[0:1], v[161:162], v[104:107] lit 
ds_read_matrix_trans_format v[169:172], m0 offset:4096 element:1 row:3 col:1 v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[2:3], v[163:164], v[104:107] lit ds_read_matrix_trans_format v[161:164], m0 offset:6144 element:1 row:3 col:1 s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[0:1], v[165:166], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[2:3], v[167:168], v[100:103] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[4:5], v[169:170], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[6:7], v[171:172], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[4:5], v[161:162], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[6:7], v[163:164], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[161:164], m0 offset:8192 element:1 row:3 col:1 ds_read_matrix_trans_format v[165:168], m0 offset:10240 element:1 row:3 col:1 ds_read_matrix_trans_format v[169:172], m0 offset:12288 element:1 row:3 col:1 ds_read_matrix_trans_format v[173:176], m0 offset:14336 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[8:9], v[161:162], v[104:107] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[8:9], v[165:166], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[10:11], v[163:164], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[10:11], v[167:168], v[100:103] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[12:13], v[169:170], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[12:13], v[173:174], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[14:15], v[171:172], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[14:15], v[175:176], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[161:164], m0 offset:16384 element:1 row:3 col:1 ds_read_matrix_trans_format v[165:168], m0 offset:18432 element:1 
row:3 col:1 ds_read_matrix_trans_format v[169:172], m0 offset:20480 element:1 row:3 col:1 ds_read_matrix_trans_format v[173:176], m0 offset:22528 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[16:17], v[161:162], v[104:107] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[16:17], v[165:166], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[18:19], v[163:164], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[18:19], v[167:168], v[100:103] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[20:21], v[169:170], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[20:21], v[173:174], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[22:23], v[171:172], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[22:23], v[175:176], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[161:164], m0 offset:24576 element:1 row:3 col:1 ds_read_matrix_trans_format v[165:168], m0 offset:26624 element:1 row:3 col:1 ds_read_matrix_trans_format v[169:172], m0 offset:28672 element:1 row:3 col:1 ds_read_matrix_trans_format v[173:176], m0 offset:30720 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[24:25], v[161:162], v[104:107] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[24:25], v[165:166], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[26:27], v[163:164], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[26:27], v[167:168], v[100:103] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[28:29], v[169:170], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[28:29], v[173:174], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[30:31], v[171:172], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[30:31], v[175:176], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND 
ds_read_matrix_trans_format v[161:164], m0 offset:32768 element:1 row:3 col:1 ds_read_matrix_trans_format v[165:168], m0 offset:34816 element:1 row:3 col:1 v_mbcnt_hi_u32_b32 v112, -1, v155 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[32:33], v[161:162], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[34:35], v[163:164], v[104:107] lit v_and_b32_e32 v161, 63, v112 v_and_b32_e32 v163, 64, v112 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[32:33], v[165:166], v[100:103] lit v_max3_f32 v158, v111, v104, v105 v_xor_b32_e32 v162, 32, v161 v_add_u32_e32 v163, 64, v163 v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[34:35], v[167:168], v[100:103] lit v_max3_f32 v158, v158, v106, v107 v_cmp_lt_i32_e32 vcc, v162, v163 s_nop 0 v_max3_f32 v158, v158, v100, v101 v_cndmask_b32_e32 v162, v112, v162, vcc v_max3_f32 v158, v158, v102, v103 v_lshlrev_b32_e32 v162, 2, v162 ds_bpermute_b32 v162, v162, v158 v_xor_b32_e32 v161, 16, v161 v_cmp_lt_i32_e32 vcc, v161, v163 v_cndmask_b32_e32 v112, v112, v161, vcc v_lshlrev_b32_e32 v161, 2, v112 s_waitcnt lgkmcnt(0) v_max_f32_e32 v112, v158, v162 ds_bpermute_b32 v158, v161, v112 s_and_saveexec_b64 s[40:41], s[2:3] s_cbranch_execz .LBB0_48 ; %bb.47: ; in Loop: Header=BB0_42 Depth=2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v112, v112, v158 ds_write_b32 v115, v112 .LBB0_48: ; in Loop: Header=BB0_42 Depth=2 s_or_b64 exec, exec, s[40:41] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[40:41], s[4:5] s_cbranch_execz .LBB0_50 ; %bb.49: ; in Loop: Header=BB0_42 Depth=2 ds_read_b64 v[161:162], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v112, v161, v162 ds_write_b32 v125, v112 offset:512 .LBB0_50: ; 
%_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp1ILb0ELb1ELb1EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi256EEEEEENS8_IJSB_EEEEEEEEEvRT2_RT3_fPDv4_f.exit1446.i ; in Loop: Header=BB0_42 Depth=2 s_or_b64 exec, exec, s[40:41] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v158, v125 offset:512 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, s78, v158 v_mul_f32_e32 v161, v158, v114 v_cndmask_b32_e64 v112, v158, 0, vcc v_cndmask_b32_e64 v161, v161, 0, vcc v_sub_f32_e32 v111, v111, v112 v_fma_f32 v104, v104, v114, -v161 v_mul_f32_e32 v111, v111, v114 v_exp_f32_e32 v225, v104 v_fma_f32 v104, v105, v114, -v161 v_exp_f32_e32 v111, v111 v_exp_f32_e32 v226, v104 v_fma_f32 v104, v106, v114, -v161 v_fma_f32 v100, v100, v114, -v161 v_exp_f32_e32 v106, v104 v_fma_f32 v104, v107, v114, -v161 v_exp_f32_e32 v227, v100 v_exp_f32_e32 v107, v104 v_fma_f32 v100, v101, v114, -v161 v_mov_b32_e32 v112, v111 v_exp_f32_e32 v228, v100 v_fma_f32 v100, v102, v114, -v161 v_pk_mul_f32 v[52:53], v[111:112], v[52:53] v_pk_mul_f32 v[54:55], v[111:112], v[54:55] v_pk_mul_f32 v[48:49], v[111:112], v[48:49] v_pk_mul_f32 v[50:51], v[111:112], v[50:51] v_pk_mul_f32 v[44:45], v[111:112], v[44:45] v_pk_mul_f32 v[46:47], v[111:112], v[46:47] v_pk_mul_f32 v[40:41], v[111:112], v[40:41] v_pk_mul_f32 v[42:43], v[111:112], v[42:43] v_pk_mul_f32 v[68:69], v[111:112], v[68:69] v_pk_mul_f32 v[70:71], v[111:112], v[70:71] v_pk_mul_f32 v[64:65], v[111:112], v[64:65] v_pk_mul_f32 v[66:67], v[111:112], v[66:67] v_pk_mul_f32 v[60:61], v[111:112], v[60:61] v_pk_mul_f32 v[62:63], v[111:112], v[62:63] v_pk_mul_f32 v[56:57], v[111:112], v[56:57] v_pk_mul_f32 v[58:59], v[111:112], v[58:59] v_pk_mul_f32 v[84:85], v[111:112], v[84:85] v_pk_mul_f32 v[86:87], v[111:112], v[86:87] v_pk_mul_f32 v[80:81], v[111:112], v[80:81] v_pk_mul_f32 v[82:83], v[111:112], 
v[82:83] v_pk_mul_f32 v[76:77], v[111:112], v[76:77] v_pk_mul_f32 v[78:79], v[111:112], v[78:79] v_pk_mul_f32 v[72:73], v[111:112], v[72:73] v_pk_mul_f32 v[74:75], v[111:112], v[74:75] v_pk_mul_f32 v[96:97], v[111:112], v[96:97] v_pk_mul_f32 v[98:99], v[111:112], v[98:99] v_pk_mul_f32 v[92:93], v[111:112], v[92:93] v_pk_mul_f32 v[94:95], v[111:112], v[94:95] v_pk_mul_f32 v[88:89], v[111:112], v[88:89] v_exp_f32_e32 v229, v100 v_fma_f32 v100, v103, v114, -v161 v_pk_mul_f32 v[90:91], v[111:112], v[90:91] v_pk_mul_f32 v[36:37], v[111:112], v[36:37] v_pk_mul_f32 v[38:39], v[111:112], v[38:39] v_exp_f32_e32 v230, v100 ; sched_barrier mask(0x00000000) v_mov_b32_e32 v100, v225 v_mov_b32_e32 v101, v227 v_cvt_pk_fp8_f32 v100, v100, v226, v160 v_cvt_pk_fp8_f32 v101, v101, v228, v159 v_mov_b32_e32 v160, v106 v_mov_b32_e32 v159, v229 v_cvt_pk_fp8_f32 v160, v160, v107, v100 op_sel:[0,0,0,1] v_cvt_pk_fp8_f32 v159, v159, v230, v101 op_sel:[0,0,0,1] v_add_u32_e32 v100, 0, v127 v_add_u32_e32 v100, 0xa000, v100 ds_write2_b32 v100, v160, v159 offset1:2 v_add_u32_e32 v100, 0, v126 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[100:103], v100 offset:40960 ; sched_barrier mask(0x00000000) ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[161:163:165:167], v128 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[162:164:166:168], v117 ds_read_m64x16_b8_alt4 v[169:171:173:175], v118 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[170:172:174:176], v119 ds_read_m64x16_b8_alt4 v[177:179:181:183], v141 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[178:180:182:184], v142 ds_read_m64x16_b8_alt4 v[185:187:189:191], v143 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[186:188:190:192], v144 ds_read_m64x16_b8_alt4 v[193:195:197:199], v145 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[194:196:198:200], v146 ds_read_m64x16_b8_alt4 v[201:203:205:207], v147 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[202:204:206:208], v148 ds_read_m64x16_b8_alt4 v[209:211:213:215], v149 s_waitcnt 
lgkmcnt(0) ds_read_m64x16_b8_alt4 v[210:212:214:216], v152 ds_read_m64x16_b8_alt4 v[217:219:221:223], v153 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[218:220:222:224], v154 ; sched_barrier mask(0x00000000) v_mov_b32_e32 v104, v161 v_mov_b32_e32 v105, v162 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[100:101], v[104:105], v[52:55] lit v_mov_b32_e32 v104, v163 v_mov_b32_e32 v105, v164 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[100:101], v[104:105], v[48:51] lit v_mov_b32_e32 v104, v165 v_mov_b32_e32 v105, v166 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[100:101], v[104:105], v[44:47] lit v_mov_b32_e32 v104, v167 v_mov_b32_e32 v105, v168 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[100:101], v[104:105], v[40:43] lit v_mov_b32_e32 v104, v169 v_mov_b32_e32 v105, v170 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[102:103], v[104:105], v[52:55] lit v_mov_b32_e32 v104, v171 v_mov_b32_e32 v105, v172 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[102:103], v[104:105], v[48:51] lit v_mov_b32_e32 v104, v173 v_mov_b32_e32 v105, v174 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[102:103], v[104:105], v[44:47] lit v_mov_b32_e32 v104, v175 v_mov_b32_e32 v105, v176 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[102:103], v[104:105], v[40:43] lit v_mov_b32_e32 v104, v177 v_mov_b32_e32 v105, v178 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[100:101], v[104:105], v[68:71] lit v_mov_b32_e32 v104, v179 v_mov_b32_e32 v105, v180 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[100:101], v[104:105], v[64:67] lit v_mov_b32_e32 v104, v181 v_mov_b32_e32 v105, v182 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[100:101], v[104:105], v[60:63] lit v_mov_b32_e32 v104, v183 v_mov_b32_e32 v105, v184 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[100:101], v[104:105], v[56:59] lit v_mov_b32_e32 v104, v185 v_mov_b32_e32 v105, v186 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[102:103], v[104:105], v[68:71] lit v_mov_b32_e32 v104, v187 v_mov_b32_e32 v105, v188 s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[102:103], v[104:105], v[64:67] lit v_mov_b32_e32 v104, v189 v_mov_b32_e32 v105, v190 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[102:103], v[104:105], v[60:63] lit v_mov_b32_e32 v104, v191 v_mov_b32_e32 v105, v192 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[102:103], v[104:105], v[56:59] lit v_mov_b32_e32 v104, v193 v_mov_b32_e32 v105, v194 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[100:101], v[104:105], v[84:87] lit v_mov_b32_e32 v104, v195 v_mov_b32_e32 v105, v196 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[100:101], v[104:105], v[80:83] lit v_mov_b32_e32 v104, v197 v_mov_b32_e32 v105, v198 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[100:101], v[104:105], v[76:79] lit v_mov_b32_e32 v104, v199 v_mov_b32_e32 v105, v200 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[100:101], v[104:105], v[72:75] lit v_mov_b32_e32 v104, v201 v_mov_b32_e32 v105, v202 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[102:103], v[104:105], v[84:87] lit v_mov_b32_e32 v104, v203 v_mov_b32_e32 v105, v204 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[102:103], v[104:105], v[80:83] lit v_mov_b32_e32 v104, v205 v_mov_b32_e32 v105, v206 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[102:103], v[104:105], v[76:79] lit v_mov_b32_e32 v104, v207 v_mov_b32_e32 v105, v208 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[102:103], v[104:105], v[72:75] lit v_mov_b32_e32 v104, v209 v_mov_b32_e32 v105, v210 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[100:101], v[104:105], v[96:99] lit v_mov_b32_e32 v104, v211 v_mov_b32_e32 v105, v212 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[100:101], v[104:105], v[92:95] lit v_mov_b32_e32 v104, v213 v_mov_b32_e32 v105, v214 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[100:101], v[104:105], v[88:91] lit v_mov_b32_e32 v104, v215 v_mov_b32_e32 v105, v216 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[36:39], v[100:101], v[104:105], v[36:39] lit v_mov_b32_e32 v100, v217 s_waitcnt lgkmcnt(0) v_mov_b32_e32 
v101, v218 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[102:103], v[100:101], v[96:99] lit v_mov_b32_e32 v100, v219 v_mov_b32_e32 v101, v220 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[102:103], v[100:101], v[92:95] lit v_mov_b32_e32 v100, v221 v_mov_b32_e32 v101, v222 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[102:103], v[100:101], v[88:91] lit v_mov_b32_e32 v100, v223 v_mov_b32_e32 v101, v224 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[36:39], v[102:103], v[100:101], v[36:39] lit v_add_f32_e32 v100, v226, v225 v_add_f32_e32 v100, v100, v106 v_add_f32_e32 v100, v100, v107 v_add_f32_e32 v100, v100, v227 v_add_f32_e32 v100, v100, v228 v_add_f32_e32 v100, v100, v229 v_add_f32_e32 v100, v100, v230 v_fmac_f32_e32 v100, v111, v157 ;;#ASMSTART s_barrier ;;#ASMEND s_add_i32 s85, s85, -1 s_add_u32 s52, s52, -4 s_addc_u32 s53, s53, -1 s_cmp_gt_i32 s85, s81 s_cbranch_scc0 .LBB0_38 ; %bb.51: ; in Loop: Header=BB0_42 Depth=2 v_mov_b32_e32 v157, v100 v_mov_b32_e32 v111, v158 s_branch .LBB0_42 .LBB0_52: ; in Loop: Header=BB0_5 Depth=1 s_add_u32 s20, s24, s20 s_addc_u32 s21, s25, s21 global_load_dword v2, v108, s[20:21] v_mbcnt_hi_u32_b32 v0, -1, v155 v_and_b32_e32 v1, 63, v0 v_and_b32_e32 v4, 64, v0 v_xor_b32_e32 v3, 32, v1 v_add_u32_e32 v4, 64, v4 v_cmp_lt_i32_e32 vcc, v3, v4 v_cndmask_b32_e32 v3, v0, v3, vcc v_lshlrev_b32_e32 v3, 2, v3 ds_bpermute_b32 v3, v3, v100 v_xor_b32_e32 v1, 16, v1 v_cmp_lt_i32_e32 vcc, v1, v4 v_cndmask_b32_e32 v0, v0, v1, vcc v_lshlrev_b32_e32 v1, 2, v0 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v3, v100 ds_bpermute_b32 v1, v1, v0 s_waitcnt vmcnt(0) v_readfirstlane_b32 s11, v2 s_and_saveexec_b64 s[20:21], s[2:3] s_cbranch_execz .LBB0_54 ; %bb.53: ; in Loop: Header=BB0_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v0, v1 ds_write_b32 v120, v0 .LBB0_54: ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[20:21] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[20:21], s[4:5] s_cbranch_execz .LBB0_56 ; %bb.55: ; in Loop: 
Header=BB0_5 Depth=1 ds_read_b64 v[0:1], v121 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v1, v0 ds_write_b32 v150, v0 offset:512 .LBB0_56: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp1ILb0ELb1ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi256EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[20:21] s_and_b64 s[8:9], s[8:9], exec s_cselect_b32 s8, s33, 0 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v0, v150 offset:512 s_add_i32 s8, s11, s8 s_mul_i32 s8, s8, s29 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s20, s8, s6 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v0 s_and_saveexec_b64 s[26:27], s[66:67] s_cbranch_execz .LBB0_58 ; %bb.57: ; in Loop: Header=BB0_5 Depth=1 v_log_f32_e32 v1, v0 s_ashr_i32 s21, s20, 31 v_mov_b32_e32 v2, 0xff800000 s_lshl_b64 s[8:9], s[20:21], 2 v_mul_f32_e32 v1, 0x3f317218, v1 v_fmac_f32_e32 v1, v158, v113 v_cndmask_b32_e32 v3, v1, v2, vcc v_mov_b32_e32 v2, s9 v_add_co_u32_e64 v1, s[8:9], s8, v129 v_addc_co_u32_e64 v2, s[8:9], v130, v2, s[8:9] global_store_dword v[1:2], v3, off .LBB0_58: ; %.loopexit688.i ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[26:27] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB0_60 ; %bb.59: ; %.preheader685.i ; in Loop: Header=BB0_5 Depth=1 v_rcp_f32_e32 v3, v0 v_mov_b32_e32 v111, v78 v_mov_b32_e32 v112, v74 v_mov_b32_e32 v2, v53 v_mul_f32_e32 v3, s34, v3 v_cndmask_b32_e64 v105, v3, 1.0, vcc v_mov_b32_e32 v106, v105 v_pk_mul_f32 v[161:162], v[105:106], v[111:112] v_mov_b32_e32 v111, v79 v_mov_b32_e32 v112, v75 v_pk_mul_f32 v[165:166], v[105:106], v[111:112] v_mov_b32_e32 v111, v96 v_mov_b32_e32 v112, v92 v_pk_mul_f32 v[167:168], v[105:106], v[111:112] v_mov_b32_e32 v111, v97 v_mov_b32_e32 v112, v93 v_pk_mul_f32 v[171:172], v[105:106], v[111:112] v_mov_b32_e32 v111, v98 v_mov_b32_e32 v112, v94 
v_pk_mul_f32 v[175:176], v[105:106], v[111:112] v_mov_b32_e32 v111, v99 v_mov_b32_e32 v112, v95 v_mov_b32_e32 v3, v49 v_mov_b32_e32 v18, v69 v_mov_b32_e32 v19, v65 v_mov_b32_e32 v34, v85 v_mov_b32_e32 v35, v81 v_pk_mul_f32 v[179:180], v[105:106], v[111:112] v_mov_b32_e32 v111, v88 v_mov_b32_e32 v112, v36 v_pk_mul_f32 v[4:5], v[105:106], v[2:3] v_mov_b32_e32 v2, v54 v_mov_b32_e32 v3, v50 v_pk_mul_f32 v[20:21], v[105:106], v[18:19] v_mov_b32_e32 v18, v70 v_mov_b32_e32 v19, v66 v_pk_mul_f32 v[101:102], v[105:106], v[34:35] v_mov_b32_e32 v34, v86 v_mov_b32_e32 v35, v82 v_pk_mul_f32 v[169:170], v[105:106], v[111:112] v_mov_b32_e32 v111, v89 v_mov_b32_e32 v112, v37 v_pk_mul_f32 v[8:9], v[105:106], v[2:3] v_mov_b32_e32 v2, v55 v_mov_b32_e32 v3, v51 v_pk_mul_f32 v[24:25], v[105:106], v[18:19] v_mov_b32_e32 v18, v71 v_mov_b32_e32 v19, v67 v_pk_mul_f32 v[159:160], v[105:106], v[34:35] v_mov_b32_e32 v34, v87 v_mov_b32_e32 v35, v83 v_pk_mul_f32 v[173:174], v[105:106], v[111:112] v_mov_b32_e32 v111, v90 v_mov_b32_e32 v112, v38 v_mov_b32_e32 v0, v52 v_mov_b32_e32 v1, v48 v_pk_mul_f32 v[12:13], v[105:106], v[2:3] v_mov_b32_e32 v2, v44 v_mov_b32_e32 v3, v40 v_mov_b32_e32 v6, v45 v_mov_b32_e32 v7, v41 v_mov_b32_e32 v10, v46 v_mov_b32_e32 v11, v42 v_mov_b32_e32 v14, v47 v_mov_b32_e32 v15, v43 v_mov_b32_e32 v16, v68 v_mov_b32_e32 v17, v64 v_pk_mul_f32 v[28:29], v[105:106], v[18:19] v_mov_b32_e32 v18, v60 v_mov_b32_e32 v19, v56 v_mov_b32_e32 v22, v61 v_mov_b32_e32 v23, v57 v_mov_b32_e32 v26, v62 v_mov_b32_e32 v27, v58 v_mov_b32_e32 v30, v63 v_mov_b32_e32 v31, v59 v_mov_b32_e32 v32, v84 v_mov_b32_e32 v33, v80 v_pk_mul_f32 v[163:164], v[105:106], v[34:35] v_mov_b32_e32 v34, v76 v_mov_b32_e32 v35, v72 v_mov_b32_e32 v103, v77 v_mov_b32_e32 v104, v73 v_pk_mul_f32 v[177:178], v[105:106], v[111:112] v_mov_b32_e32 v111, v91 v_mov_b32_e32 v112, v39 s_mul_i32 s20, s20, s28 v_or_b32_e32 v107, s79, v122 v_pk_mul_f32 v[0:1], v[105:106], v[0:1] v_pk_mul_f32 v[2:3], v[105:106], v[2:3] v_pk_mul_f32 
v[6:7], v[105:106], v[6:7] v_pk_mul_f32 v[10:11], v[105:106], v[10:11] v_pk_mul_f32 v[14:15], v[105:106], v[14:15] v_pk_mul_f32 v[16:17], v[105:106], v[16:17] v_pk_mul_f32 v[18:19], v[105:106], v[18:19] v_pk_mul_f32 v[22:23], v[105:106], v[22:23] v_pk_mul_f32 v[26:27], v[105:106], v[26:27] v_pk_mul_f32 v[30:31], v[105:106], v[30:31] v_pk_mul_f32 v[32:33], v[105:106], v[32:33] v_pk_mul_f32 v[34:35], v[105:106], v[34:35] v_pk_mul_f32 v[103:104], v[105:106], v[103:104] v_pk_mul_f32 v[181:182], v[105:106], v[111:112] s_ashr_i32 s21, s20, 31 v_ashrrev_i32_e32 v106, 31, v107 v_add_co_u32_e32 v105, vcc, v107, v151 s_lshl_b64 s[20:21], s[20:21], 2 v_addc_co_u32_e32 v106, vcc, 0, v106, vcc s_add_u32 s11, s64, s20 v_lshlrev_b64 v[105:106], 2, v[105:106] s_addc_u32 s20, s65, s21 v_mov_b32_e32 v111, s20 v_add_co_u32_e32 v105, vcc, s11, v105 v_addc_co_u32_e32 v106, vcc, v111, v106, vcc global_store_dwordx4 v[105:106], v[0:3], off global_store_dwordx4 v[105:106], v[4:7], off offset:16 global_store_dwordx4 v[105:106], v[8:11], off offset:32 global_store_dwordx4 v[105:106], v[12:15], off offset:48 v_add_u32_e32 v0, 0x80, v107 v_ashrrev_i32_e32 v1, 31, v0 v_add_co_u32_e32 v0, vcc, v0, v151 v_addc_co_u32_e32 v1, vcc, 0, v1, vcc v_lshlrev_b64 v[0:1], 2, v[0:1] v_mov_b32_e32 v2, s20 v_add_co_u32_e32 v0, vcc, s11, v0 v_addc_co_u32_e32 v1, vcc, v2, v1, vcc global_store_dwordx4 v[0:1], v[16:19], off global_store_dwordx4 v[0:1], v[20:23], off offset:16 global_store_dwordx4 v[0:1], v[24:27], off offset:32 global_store_dwordx4 v[0:1], v[28:31], off offset:48 v_add_u32_e32 v0, 0x100, v107 v_ashrrev_i32_e32 v1, 31, v0 v_add_co_u32_e32 v0, vcc, v0, v151 v_addc_co_u32_e32 v1, vcc, 0, v1, vcc v_lshlrev_b64 v[0:1], 2, v[0:1] v_add_co_u32_e32 v0, vcc, s11, v0 v_addc_co_u32_e32 v1, vcc, v2, v1, vcc global_store_dwordx4 v[0:1], v[32:35], off global_store_dwordx4 v[0:1], v[101:104], off offset:16 global_store_dwordx4 v[0:1], v[159:162], off offset:32 global_store_dwordx4 v[0:1], v[163:166], off 
offset:48 v_add_u32_e32 v0, 0x180, v107 v_ashrrev_i32_e32 v1, 31, v0 v_add_co_u32_e32 v0, vcc, v0, v151 v_addc_co_u32_e32 v1, vcc, 0, v1, vcc v_lshlrev_b64 v[0:1], 2, v[0:1] v_add_co_u32_e32 v0, vcc, s11, v0 v_addc_co_u32_e32 v1, vcc, v2, v1, vcc global_store_dwordx4 v[0:1], v[167:170], off global_store_dwordx4 v[0:1], v[171:174], off offset:16 global_store_dwordx4 v[0:1], v[175:178], off offset:32 global_store_dwordx4 v[0:1], v[179:182], off offset:48 .LBB0_60: ; %Flow ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_mov_b64 s[26:27], 0 s_branch .LBB0_4 .LBB0_61: ; in Loop: Header=BB0_5 Depth=1 v_mbcnt_hi_u32_b32 v0, -1, v155 v_and_b32_e32 v1, 63, v0 v_and_b32_e32 v3, 64, v0 v_xor_b32_e32 v2, 32, v1 v_add_u32_e32 v3, 64, v3 v_cmp_lt_i32_e32 vcc, v2, v3 v_cndmask_b32_e32 v2, v0, v2, vcc v_lshlrev_b32_e32 v2, 2, v2 ds_bpermute_b32 v2, v2, v100 v_xor_b32_e32 v1, 16, v1 v_cmp_lt_i32_e32 vcc, v1, v3 v_cndmask_b32_e32 v0, v0, v1, vcc v_lshlrev_b32_e32 v1, 2, v0 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v2, v100 ds_bpermute_b32 v1, v1, v0 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB0_63 ; %bb.62: ; in Loop: Header=BB0_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v0, v1 ds_write_b32 v120, v0 .LBB0_63: ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB0_65 ; %bb.64: ; in Loop: Header=BB0_5 Depth=1 ds_read_b64 v[0:1], v121 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v1, v0 ds_write_b32 v150, v0 offset:512 .LBB0_65: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp1ILb0ELb0ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi256EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v0, v150 offset:512 
s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v0 s_and_saveexec_b64 s[20:21], s[66:67] s_cbranch_execz .LBB0_67 ; %bb.66: ; in Loop: Header=BB0_5 Depth=1 s_mul_i32 s8, s18, s29 v_log_f32_e32 v1, v0 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s8, s8, s6 s_ashr_i32 s9, s8, 31 v_mul_f32_e32 v1, 0x3f317218, v1 v_fmac_f32_e32 v1, v158, v113 v_mov_b32_e32 v2, 0x7f800000 s_lshl_b64 s[8:9], s[8:9], 2 v_cndmask_b32_e32 v3, v1, v2, vcc v_mov_b32_e32 v2, s9 v_add_co_u32_e64 v1, s[8:9], s8, v131 v_addc_co_u32_e64 v2, s[8:9], v132, v2, s[8:9] global_store_dword v[1:2], v3, off .LBB0_67: ; %.loopexit.i ; in Loop: Header=BB0_5 Depth=1 s_or_b64 exec, exec, s[20:21] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB0_3 ; %bb.68: ; %.preheader.i ; in Loop: Header=BB0_5 Depth=1 v_readlane_b32 s11, v231, 0 s_mul_i32 s11, s18, s11 s_mul_hi_u32 s20, s18, s30 v_rcp_f32_e32 v0, v0 s_add_i32 s11, s20, s11 s_mul_i32 s19, s19, s30 s_add_i32 s11, s11, s19 s_mul_i32 s19, s18, s30 v_readlane_b32 s20, v231, 1 s_add_u32 s20, s20, s19 s_addc_u32 s21, s76, s11 v_mul_f32_e32 v0, s34, v0 s_lshl_b64 s[20:21], s[20:21], 1 v_cndmask_b32_e64 v0, v0, 1.0, vcc s_add_u32 s11, s16, s20 v_mul_f32_e32 v1, v0, v52 v_mul_f32_e32 v3, v0, v48 s_addc_u32 s19, s17, s21 v_mul_f32_e32 v2, v0, v53 v_mul_f32_e32 v4, v0, v54 v_mul_f32_e32 v6, v0, v55 v_mul_f32_e32 v5, v0, v49 v_mul_f32_e32 v7, v0, v50 v_mul_f32_e32 v8, v0, v51 v_mul_f32_e32 v9, v0, v44 v_mul_f32_e32 v10, v0, v45 v_mul_f32_e32 v11, v0, v46 v_mul_f32_e32 v12, v0, v47 v_mul_f32_e32 v13, v0, v40 v_mul_f32_e32 v14, v0, v41 v_mul_f32_e32 v15, v0, v42 v_mul_f32_e32 v18, v0, v43 v_mul_f32_e32 v19, v0, v68 v_mul_f32_e32 v20, v0, v69 v_mul_f32_e32 v21, v0, v70 v_mul_f32_e32 v22, v0, v71 v_mul_f32_e32 v23, v0, v64 v_mul_f32_e32 v24, v0, v65 v_mul_f32_e32 v25, v0, v66 v_mul_f32_e32 v26, v0, v67 v_mul_f32_e32 v27, v0, v60 v_mul_f32_e32 v28, v0, v61 v_mul_f32_e32 v29, v0, v62 v_mul_f32_e32 v30, v0, v63 v_mul_f32_e32 v31, v0, v56 v_mul_f32_e32 v32, v0, v57 
v_mul_f32_e32 v33, v0, v58 v_mul_f32_e32 v34, v0, v59 v_mul_f32_e32 v35, v0, v84 v_mul_f32_e32 v40, v0, v85 v_mul_f32_e32 v41, v0, v86 v_mul_f32_e32 v42, v0, v87 v_mul_f32_e32 v43, v0, v80 v_mul_f32_e32 v44, v0, v81 v_mul_f32_e32 v45, v0, v82 v_mul_f32_e32 v46, v0, v83 v_mul_f32_e32 v47, v0, v76 v_mul_f32_e32 v48, v0, v77 v_mul_f32_e32 v49, v0, v78 v_mul_f32_e32 v50, v0, v79 v_mul_f32_e32 v51, v0, v72 v_mul_f32_e32 v52, v0, v73 v_mul_f32_e32 v53, v0, v74 v_mul_f32_e32 v54, v0, v75 v_mul_f32_e32 v55, v0, v96 v_mul_f32_e32 v56, v0, v97 v_mul_f32_e32 v57, v0, v98 v_mul_f32_e32 v58, v0, v99 v_mul_f32_e32 v59, v0, v92 v_mul_f32_e32 v60, v0, v93 v_mul_f32_e32 v61, v0, v94 v_mul_f32_e32 v62, v0, v95 v_mul_f32_e32 v63, v0, v88 v_mul_f32_e32 v64, v0, v89 v_mul_f32_e32 v65, v0, v90 v_mul_f32_e32 v66, v0, v91 v_mul_f32_e32 v36, v0, v36 v_mul_f32_e32 v37, v0, v37 v_mul_f32_e32 v38, v0, v38 v_mul_f32_e32 v39, v0, v39 v_cvt_pk_bf16_f32 v0, v1, v3 v_mov_b32_e32 v3, s19 v_add_co_u32_e32 v16, vcc, s11, v109 v_cvt_pk_bf16_f32 v1, v9, v13 v_addc_co_u32_e32 v17, vcc, v3, v110, vcc v_cvt_pk_bf16_f32 v2, v2, v5 v_cvt_pk_bf16_f32 v3, v10, v14 v_cvt_pk_bf16_f32 v4, v4, v7 v_cvt_pk_bf16_f32 v5, v11, v15 v_cvt_pk_bf16_f32 v6, v6, v8 v_cvt_pk_bf16_f32 v7, v12, v18 v_cvt_pk_bf16_f32 v8, v19, v23 v_cvt_pk_bf16_f32 v9, v27, v31 v_cvt_pk_bf16_f32 v10, v20, v24 v_cvt_pk_bf16_f32 v11, v28, v32 v_cvt_pk_bf16_f32 v12, v21, v25 v_cvt_pk_bf16_f32 v13, v29, v33 v_cvt_pk_bf16_f32 v14, v22, v26 v_cvt_pk_bf16_f32 v15, v30, v34 global_store_dwordx4 v[16:17], v[0:3], off global_store_dwordx4 v[16:17], v[4:7], off offset:16 global_store_dwordx4 v[16:17], v[8:11], off offset:256 global_store_dwordx4 v[16:17], v[12:15], off offset:272 v_cvt_pk_bf16_f32 v0, v35, v43 v_cvt_pk_bf16_f32 v1, v47, v51 v_cvt_pk_bf16_f32 v2, v40, v44 v_cvt_pk_bf16_f32 v3, v48, v52 v_cvt_pk_bf16_f32 v4, v41, v45 v_cvt_pk_bf16_f32 v5, v49, v53 v_cvt_pk_bf16_f32 v6, v42, v46 v_cvt_pk_bf16_f32 v7, v50, v54 v_cvt_pk_bf16_f32 v8, v55, v59 
; -----------------------------------------------------------------------------
; End of the flash_fwd_splitkv_mla_kernel_fp8_tp1<..., Lb1, ...> kernel body and
; start of its HSA kernel descriptor.
;
; The next line (original newlines were collapsed into spaces) contains, in order:
;   * the remaining v_cvt_pk_bf16_f32 packs and four global_store_dwordx4 writes
;     through v[16:17] at offsets 512/528/768/784, then a branch back to .LBB0_3;
;   * the exit label .LBB0_69 followed by s_endpgm;
;   * a switch to .rodata with .p2align 6 (64-byte alignment);
;   * the opening of the .amdhsa_kernel descriptor for the same symbol.
;
; Descriptor values that can be cross-checked against the visible code:
;   .amdhsa_kernarg_size 336          - the prologue's highest visible kernarg read
;                                       is a dwordx4 at offset 0x140, which ends at
;                                       0x150 = 336 bytes.
;   .amdhsa_user_sgpr_count 6         - only private_segment_buffer and
;                                       kernarg_segment_ptr are enabled; the body
;                                       reads kernargs through s[4:5].
;   .amdhsa_system_sgpr_workgroup_id_{x,y,z} 1
;                                     - NOTE(review): presumably these arrive in
;                                       s6/s7/s8, which the prologue consumes
;                                       directly; confirm against the ABI docs.
;   .amdhsa_next_free_vgpr 232        - consistent with v231 being used as the
;                                       SGPR-spill VGPR (v_readlane_b32 ... v231).
;   .amdhsa_group_segment_fixed_size 0 and .amdhsa_private_segment_fixed_size 0
;                                     - no static LDS and no scratch are declared.
;                                       The body still issues ds_read/ds_write, so
;                                       LDS is presumably sized dynamically at
;                                       launch; TODO confirm with the host code.
; -----------------------------------------------------------------------------
v_cvt_pk_bf16_f32 v9, v63, v36 v_cvt_pk_bf16_f32 v10, v56, v60 v_cvt_pk_bf16_f32 v11, v64, v37 v_cvt_pk_bf16_f32 v12, v57, v61 v_cvt_pk_bf16_f32 v13, v65, v38 v_cvt_pk_bf16_f32 v14, v58, v62 v_cvt_pk_bf16_f32 v15, v66, v39 global_store_dwordx4 v[16:17], v[0:3], off offset:512 global_store_dwordx4 v[16:17], v[4:7], off offset:528 global_store_dwordx4 v[16:17], v[8:11], off offset:768 global_store_dwordx4 v[16:17], v[12:15], off offset:784 s_branch .LBB0_3 .LBB0_69: ; %.loopexit s_endpgm .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 0 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_sgpr_workgroup_id_z 1 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 232 .amdhsa_next_free_sgpr 96 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 
; -----------------------------------------------------------------------------
; Kernel descriptor, continued. The next line contains, in order:
;   * the remaining exception-enable directives (all 0) and .end_amdhsa_kernel;
;   * a return to the kernel's own .text section to place .Lfunc_end0 and the
;     .size expression (.Lfunc_end0 minus the kernel symbol);
;   * the .AMDGPU.csdata section, whose "Kernel info" entries are comments only.
;     They restate the descriptor: NumVgprs 232 matches next_free_vgpr,
;     USER_SGPR 6 matches user_sgpr_count, TGID_{X,Y,Z}_EN 1 match the
;     workgroup_id enables, and ScratchSize 0 / LDSByteSize 0 match the two
;     *_segment_fixed_size directives.
;     NOTE(review): NumSgprs is 100 while next_free_sgpr is 96; the extra 4 are
;     presumably the reserved VCC and XNACK_MASK pairs (reserve_xnack_mask 1,
;     reserve_flat_scratch 0) - confirm.
;     NOTE(review): FloatMode 240 (0xF0) looks like both denorm modes = 3 and
;     both round modes = 0 packed into one byte - confirm the bit layout.
;   * the section directive for the Lb0 instantiation of the same kernel
;     template, ending with a .protected directive whose symbol operand
;     continues on the following line.
; -----------------------------------------------------------------------------
.amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end0: .size _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params, .Lfunc_end0-_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 13680 ; NumSgprs: 100 ; NumVgprs: 232 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 0 bytes/workgroup (compile time only) ; SGPRBlocks: 12 ; VGPRBlocks: 57 ; NumSGPRsForWavesPerEU: 100 ; NumVGPRsForWavesPerEU: 232 ; Occupancy: 1 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .globl _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params,@function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x110 s_load_dwordx2 s[34:35], s[4:5], 0x0 s_lshl_b32 s2, s8, 3 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[2:3], s[2:3], 2 s_waitcnt lgkmcnt(0) s_add_u32 s44, s0, s2 s_addc_u32 s45, s1, s3 s_load_dwordx4 s[12:15], s[44:45], 0x0 s_waitcnt lgkmcnt(0) s_cmp_ge_i32 s12, s34 s_cbranch_scc1 .LBB1_59 ; %bb.1: s_cmp_gt_i32 s12, s14 s_cbranch_scc1 .LBB1_59 ; %bb.2: ; %.lr.ph s_load_dwordx2 s[56:57], s[4:5], 0x68 s_load_dwordx4 s[16:19], s[4:5], 0x58 s_load_dwordx2 s[58:59], s[4:5], 0x80 s_load_dwordx4 s[8:11], s[4:5], 0x90 s_load_dwordx2 s[60:61], s[4:5], 0xb0 
s_load_dwordx4 s[40:43], s[4:5], 0xc0 s_load_dwordx2 s[36:37], s[4:5], 0xe0 s_load_dwordx4 s[20:23], s[4:5], 0xf0 s_load_dwordx4 s[0:3], s[4:5], 0x140 s_load_dwordx2 s[62:63], s[4:5], 0x100 s_load_dwordx4 s[24:27], s[4:5], 0x120 s_load_dwordx4 s[28:31], s[4:5], 0xc s_load_dwordx2 s[38:39], s[4:5], 0x20 s_load_dword s33, s[44:45], 0x10 s_load_dwordx2 s[64:65], s[4:5], 0x130 s_waitcnt lgkmcnt(0) s_load_dword s0, s[0:1], 0x0 s_nop 0 s_load_dword s34, s[2:3], 0x0 s_ashr_i32 s1, s13, 31 s_lshr_b32 s1, s1, 26 s_lshl_b32 s6, s6, 6 s_add_i32 s1, s13, s1 s_ashr_i32 s2, s6, 31 s_ashr_i32 s44, s7, 31 s_ashr_i32 s13, s1, 6 s_mul_i32 s1, s10, s2 s_mul_hi_u32 s3, s10, s6 s_mul_hi_u32 s31, s42, s7 s_mul_i32 s45, s42, s44 s_add_i32 s1, s3, s1 s_mul_i32 s3, s11, s6 s_add_i32 s31, s31, s45 s_mul_i32 s43, s43, s7 s_add_i32 s1, s1, s3 s_mul_i32 s3, s10, s6 s_add_i32 s31, s31, s43 s_mul_i32 s42, s42, s7 s_add_u32 s66, s42, s3 s_addc_u32 s67, s31, s1 s_ashr_i32 s1, s30, 31 s_add_i32 s3, s30, s1 s_xor_b32 s3, s3, s1 v_cvt_f32_u32_e32 v1, s3 s_mov_b32 s68, s8 s_sub_i32 s8, 0, s3 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v2, s0 v_rcp_iflag_f32_e32 v1, v1 s_xor_b32 s0, s44, s1 s_add_i32 s1, s7, s44 s_xor_b32 s1, s1, s44 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 s_mov_b32 s69, s9 v_and_b32_e32 v7, 63, v0 v_and_b32_e32 v9, 60, v0 v_readfirstlane_b32 s30, v1 s_mul_i32 s8, s8, s30 s_mul_hi_u32 s8, s30, s8 s_add_i32 s30, s30, s8 s_mul_hi_u32 s8, s1, s30 s_mul_i32 s30, s8, s3 s_sub_i32 s1, s1, s30 s_add_i32 s30, s8, 1 s_sub_i32 s31, s1, s3 s_cmp_ge_u32 s1, s3 s_cselect_b32 s8, s30, s8 s_cselect_b32 s1, s31, s1 s_add_i32 s9, s8, 1 s_cmp_ge_u32 s1, s3 s_cselect_b32 s1, s9, s8 s_xor_b32 s1, s1, s0 s_sub_i32 s3, s1, s0 s_ashr_i32 s0, s3, 31 v_sub_u16_e32 v7, v7, v9 v_lshrrev_b16_e32 v9, 3, v0 s_mul_i32 s0, s36, s0 s_mul_hi_u32 s1, s36, s3 v_and_b32_e32 v9, 3, v9 s_add_i32 s8, s1, s0 s_load_dwordx2 s[0:1], s[4:5], 0x48 v_add_u16_e32 v7, v9, v7 v_mov_b32_e32 v9, 13 v_lshrrev_b16_sdwa v9, v9, 
sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 v_and_b32_e32 v9, 3, v9 v_mul_f32_e32 v2, s34, v2 s_mul_i32 s9, s37, s3 v_lshrrev_b32_e32 v3, 2, v0 v_and_b32_e32 v4, 15, v0 v_lshrrev_b16_e32 v8, 2, v0 v_add_u16_e32 v9, v7, v9 v_mul_f32_e32 v111, s38, v2 v_mul_f32_e32 v112, s39, v2 s_add_i32 s8, s8, s9 s_mul_i32 s3, s36, s3 s_load_dwordx4 s[36:39], s[4:5], 0x28 v_lshrrev_b32_e32 v2, 8, v0 v_and_or_b32 v4, v3, 48, v4 v_and_b32_e32 v8, 15, v8 v_and_b32_e32 v9, 0xfc, v9 s_waitcnt lgkmcnt(0) s_add_u32 s70, s0, s3 s_movk_i32 s0, 0x200 v_lshlrev_b32_e32 v5, 1, v4 v_and_b32_e32 v8, 0xffff, v8 v_sub_u16_e32 v7, v7, v9 v_lshlrev_b32_e32 v9, 12, v2 s_addc_u32 s71, s1, s8 v_cmp_gt_u32_e32 vcc, s0, v0 v_add_lshl_u32 v5, v5, v2, 2 s_add_i32 s0, 0, 0xb400 v_lshlrev_b32_e32 v6, 3, v4 v_bfe_i32 v7, v7, 0, 8 v_lshl_or_b32 v8, v8, 6, v9 v_add_u32_e32 v113, s0, v5 v_add_u32_e32 v114, s0, v6 v_lshl_add_u32 v7, v7, 4, v8 s_add_i32 s0, 0, 0x400 v_add_u32_e32 v115, s0, v7 s_add_i32 s0, 0, 0x800 v_add_u32_e32 v116, s0, v7 s_add_i32 s0, 0, 0xc00 v_add_u32_e32 v1, -2, v2 v_add_u32_e32 v117, s0, v7 s_add_i32 s0, 0, 0xb000 v_cndmask_b32_e32 v1, v1, v2, vcc v_add_u32_e32 v118, s0, v5 v_bfe_u32 v5, v0, 4, 2 v_add_u32_e32 v119, s0, v6 v_lshl_or_b32 v5, v1, 6, v5 s_sub_i32 s0, s35, s6 s_mul_i32 s2, s40, s2 s_mul_hi_u32 s3, s40, s6 s_mul_hi_u32 s4, s20, s7 s_mul_i32 s5, s20, s44 v_cmp_eq_u32_e32 vcc, 0, v5 v_cmp_gt_i32_e64 s[0:1], s0, v4 s_add_i32 s2, s3, s2 s_mul_i32 s3, s41, s6 s_add_i32 s4, s4, s5 s_mul_i32 s5, s21, s7 s_sub_i32 s72, s6, s35 s_and_b64 s[30:31], vcc, s[0:1] s_add_i32 s2, s2, s3 s_mul_i32 s3, s40, s6 s_add_i32 s4, s4, s5 s_mul_i32 s5, s20, s7 s_add_u32 s73, s5, s3 v_and_b32_e32 v120, 48, v0 s_addc_u32 s75, s4, s2 s_movk_i32 s2, 0xc0 v_and_or_b32 v107, v3, s2, v120 v_and_b32_e32 v3, 12, v3 v_mov_b32_e32 v108, 0 v_lshrrev_b32_e32 v121, 6, v0 v_lshl_or_b32 v122, v1, 4, v3 v_and_b32_e32 v3, 0x330, v0 v_lshlrev_b32_e32 v6, 4, v0 v_mad_u64_u32 v[0:1], s[2:3], 
s40, v4, v[107:108] v_and_b32_e32 v124, 0xff0, v6 v_lshl_or_b32 v125, v2, 2, v124 v_mad_u64_u32 v[1:2], s[2:3], s41, v4, v[1:2] v_lshlrev_b32_e32 v5, 2, v4 v_mov_b32_e32 v2, s27 v_add_co_u32_e32 v127, vcc, s26, v5 v_addc_co_u32_e32 v128, vcc, 0, v2, vcc v_add_u32_e32 v126, 0, v7 v_mov_b32_e32 v2, s19 v_add_co_u32_e32 v129, vcc, s18, v5 v_mbcnt_lo_u32_b32 v152, -1, 0 v_lshlrev_b64 v[109:110], 1, v[0:1] s_mov_b32 s74, s10 s_mov_b32 s76, s11 v_sub_u32_e32 v123, v114, v5 v_addc_co_u32_e32 v130, vcc, 0, v2, vcc s_mov_b32 s42, s60 v_or_b32_e32 v131, 1, v122 v_or_b32_e32 v132, 2, v122 v_or_b32_e32 v133, 3, v122 v_add_u32_e32 v134, 32, v122 v_add_u32_e32 v135, 33, v122 v_add_u32_e32 v136, 34, v122 v_add_u32_e32 v137, 35, v122 v_add_u32_e32 v138, 0x2000, v126 v_add_u32_e32 v139, 0x2000, v115 v_add_u32_e32 v140, 0x2000, v116 v_add_u32_e32 v141, 0x2000, v117 v_add_u32_e32 v142, 0x4000, v126 v_add_u32_e32 v143, 0x4000, v115 v_add_u32_e32 v144, 0x4000, v116 v_add_u32_e32 v145, 0x4000, v117 v_add_u32_e32 v146, 0x6000, v126 v_add_u32_e32 v147, 0x6000, v115 v_sub_u32_e32 v148, v119, v5 v_lshlrev_b32_e32 v149, 9, v4 v_add_u32_e32 v150, 0x6000, v116 v_add_u32_e32 v151, 0x6000, v117 v_mbcnt_hi_u32_b32 v153, -1, v152 s_brev_b32 s46, 1 s_mov_b32 s47, 0x20000 s_mov_b32 s43, 0x10000 s_mov_b32 s77, 0xff800000 v_cmp_eq_u32_e64 s[2:3], 0, v120 v_cmp_eq_u32_e64 s[4:5], 0, v3 s_mov_b32 s18, s12 s_mov_b32 s48, 0 s_branch .LBB1_5 .LBB1_3: ; %Flow278 ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[8:9] .LBB1_4: ; %_ZN5flash49compute_attn_1rowblock_splitkv_mla_fp8_gfx938_TP1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEvRK20Flash_fwd_mla_paramsiiiiiiibRT1_fff.exit ; in Loop: Header=BB1_5 Depth=1 s_add_i32 s8, s18, 1 s_cmp_lt_i32 s18, s14 s_mov_b32 s18, s8 s_cbranch_scc0 .LBB1_59 .LBB1_5: ; =>This Loop Header: Depth=1 ; Child Loop BB1_32 Depth 2 s_ashr_i32 s19, s18, 31 s_lshl_b64 
s[20:21], s[18:19], 2 s_add_u32 s8, s36, s20 s_addc_u32 s9, s37, s21 global_load_dword v0, v108, s[8:9] s_cmp_le_i32 s18, s12 s_waitcnt vmcnt(0) v_readfirstlane_b32 s79, v0 s_cbranch_scc1 .LBB1_7 ; %bb.6: ; in Loop: Header=BB1_5 Depth=1 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) .LBB1_7: ; in Loop: Header=BB1_5 Depth=1 s_mul_i32 s8, s18, s57 s_mul_hi_u32 s9, s18, s56 s_add_i32 s8, s9, s8 s_mul_i32 s9, s19, s56 s_add_i32 s8, s8, s9 s_mul_i32 s9, s18, s56 s_add_u32 s9, s66, s9 s_addc_u32 s8, s67, s8 v_readfirstlane_b32 s83, v121 s_add_u32 s44, s38, s9 s_addc_u32 s45, s39, s8 s_ashr_i32 s8, s83, 31 s_lshr_b32 s8, s8, 30 s_add_i32 s8, s83, s8 s_ashr_i32 s84, s8, 2 s_and_b32 s8, s8, -4 s_sub_i32 s8, s83, s8 s_lshl_b32 s78, s84, 6 s_lshl_b32 s86, s8, 4 s_lshl_b32 s9, s84, 12 s_ashr_i32 s85, s78, 31 s_ashr_i32 s87, s86, 31 s_add_i32 s82, s9, 0 s_cmp_lg_u32 s82, -1 s_cselect_b32 s9, s82, 0 s_lshl_b32 s40, s8, 10 s_add_u32 s81, s9, s40 s_bitset1_b32 s81, 31 s_add_i32 s8, s86, s6 s_cmp_ge_i32 s8, s35 s_mov_b64 s[8:9], -1 s_cbranch_scc0 .LBB1_9 ; %bb.8: ; in Loop: Header=BB1_5 Depth=1 v_readfirstlane_b32 s8, v121 s_ashr_i32 s9, s8, 31 s_lshr_b32 s9, s9, 30 s_add_i32 s9, s8, s9 s_and_b32 s11, s9, 0x3ffffc s_sub_i32 s8, s8, s11 s_lshl_b32 s9, s9, 10 s_lshl_b32 s8, s8, 10 s_and_b32 s9, s9, 0xfffff000 s_add_i32 s8, s8, s9 s_add_i32 s8, s8, 0 v_mov_b32_e32 v0, -1 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s9, s8, 0x2000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s9, s8, 0x4000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s9, s8, 0x6000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s8, s8, 0x8000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[8:9], 0 .LBB1_9: ; 
%Flow295 ; in Loop: Header=BB1_5 Depth=1 s_andn2_b64 vcc, exec, s[8:9] s_add_i32 s52, s86, 16 s_cbranch_vccnz .LBB1_14 ; %bb.10: ; in Loop: Header=BB1_5 Depth=1 s_mul_i32 s8, s86, s76 s_mul_hi_u32 s9, s86, s74 s_add_i32 s8, s9, s8 s_mul_i32 s9, s87, s74 s_add_i32 s8, s8, s9 s_mul_i32 s9, s86, s74 s_add_u32 s9, s9, s78 s_addc_u32 s11, s8, s85 s_add_i32 s8, s52, s72 s_max_i32 s26, s8, 0 s_add_u32 s8, s44, s9 s_addc_u32 s9, s45, s11 s_lshl_b32 s11, s26, 8 s_bitset1_b32 s11, 16 s_nop 0 matrix_load_64x16_b8 s[8:11] s81 t r lds s_add_i32 s26, s81, 0x2000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:128 t r lds s_add_i32 s26, s81, 0x4000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:256 t r lds s_add_i32 s26, s81, 0x6000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:384 t r lds s_cmp_gt_i32 s83, 3 s_mov_b64 s[26:27], -1 s_cbranch_scc0 .LBB1_12 ; %bb.11: ; in Loop: Header=BB1_5 Depth=1 v_readfirstlane_b32 s26, v121 s_ashr_i32 s27, s26, 31 s_lshr_b32 s27, s27, 30 s_add_i32 s27, s26, s27 s_and_b32 s41, s27, 0x3ffffc s_lshl_b32 s27, s27, 10 s_sub_i32 s26, s26, s41 s_and_b32 s27, s27, 0xfffff000 s_lshl_b32 s26, s26, 10 s_add_i32 s27, s27, 0 s_add_i32 s26, s27, s26 s_add_i32 s26, s26, 0x8000 v_mov_b32_e32 v0, -1 ;;#ASMSTART s_mov_b32 m0, s26 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[26:27], 0 .LBB1_12: ; %Flow292 ; in Loop: Header=BB1_5 Depth=1 s_andn2_b64 vcc, exec, s[26:27] s_cbranch_vccnz .LBB1_14 ; %bb.13: ; in Loop: Header=BB1_5 Depth=1 s_add_i32 s26, s81, 0x8000 s_nop 0 matrix_load_64x16_b8 s[8:11] s26 moffset:512 t r lds .LBB1_14: ; %.preheader683.i ; in Loop: Header=BB1_5 Depth=1 s_cmp_eq_u32 s18, s12 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[26:27], s[8:9], exec s_cselect_b32 s80, s13, 0 s_cmp_eq_u32 s18, s14 s_cselect_b32 s11, s15, s79 s_add_i32 s11, s11, 63 s_ashr_i32 s26, s11, 31 s_lshr_b32 s26, s26, 26 s_add_i32 s11, s11, s26 s_ashr_i32 s11, s11, 6 s_add_i32 s26, s40, 0 ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier 
;;#ASMEND s_mov_b32 m0, s26 s_nop 0 ds_read_matrix_trans_format v[4:7], m0 element:1 row:3 col:1 ds_read_matrix_trans_format v[8:11], m0 offset:4096 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[12:15], m0 offset:8192 element:1 row:3 col:1 ds_read_matrix_trans_format v[16:19], m0 offset:12288 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[20:23], m0 offset:16384 element:1 row:3 col:1 ds_read_matrix_trans_format v[24:27], m0 offset:20480 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[28:31], m0 offset:24576 element:1 row:3 col:1 ds_read_matrix_trans_format v[32:35], m0 offset:28672 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND s_mul_i32 s26, s18, s63 s_mul_hi_u32 s27, s18, s62 s_add_i32 s26, s27, s26 s_mul_i32 s27, s19, s62 ds_read_matrix_trans_format v[36:39], m0 offset:32768 element:1 row:3 col:1 s_add_i32 s27, s26, s27 s_mul_i32 s26, s18, s62 s_lshl_b64 s[26:27], s[26:27], 2 s_add_u32 s89, s22, s26 s_addc_u32 s88, s23, s27 s_add_i32 s40, s11, -1 s_cmp_le_i32 s11, s80 v_mov_b32_e32 v154, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB1_30 ; %bb.15: ; %.lr.ph.i ; in Loop: Header=BB1_5 Depth=1 s_ashr_i32 s41, s40, 31 s_lshl_b64 s[26:27], s[40:41], 2 s_add_u32 s26, s89, s26 s_addc_u32 s27, s88, s27 ;;#ASMSTART s_load_dword s41, s[26:27], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s26, s41, 31 s_mul_i32 s27, s41, s59 s_mul_hi_u32 s44, s41, s58 s_add_i32 s27, s44, s27 s_mul_i32 s26, s26, s58 s_add_i32 s27, s27, s26 s_mul_i32 s41, s41, s58 s_add_u32 s44, s70, s41 s_addc_u32 s45, s71, s27 s_lshl_b32 s40, s40, 6 s_add_i32 s26, s86, s40 s_cmp_ge_i32 s26, s79 s_mov_b64 s[26:27], -1 s_cbranch_scc0 .LBB1_17 ; %bb.16: ; in Loop: Header=BB1_5 Depth=1 v_readfirstlane_b32 s26, v121 s_ashr_i32 s27, s26, 31 s_lshr_b32 s27, s27, 30 s_add_i32 s27, s26, 
s27 s_and_b32 s41, s27, 0x3ffffc s_sub_i32 s26, s26, s41 s_lshl_b32 s27, s27, 10 s_lshl_b32 s26, s26, 10 s_and_b32 s27, s27, 0xfffff000 s_add_i32 s26, s26, s27 s_add_i32 s26, s26, 0 v_mov_b32_e32 v0, -1 ;;#ASMSTART s_mov_b32 m0, s26 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s27, s26, 0x2000 ;;#ASMSTART s_mov_b32 m0, s27 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s27, s26, 0x4000 ;;#ASMSTART s_mov_b32 m0, s27 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s27, s26, 0x6000 ;;#ASMSTART s_mov_b32 m0, s27 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s26, s26, 0x8000 ;;#ASMSTART s_mov_b32 m0, s26 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[26:27], 0 .LBB1_17: ; %Flow289 ; in Loop: Header=BB1_5 Depth=1 s_andn2_b64 vcc, exec, s[26:27] s_cbranch_vccnz .LBB1_22 ; %bb.18: ; in Loop: Header=BB1_5 Depth=1 s_mul_i32 s26, s86, s61 s_mul_hi_u32 s27, s86, s60 s_add_i32 s26, s27, s26 s_mul_i32 s27, s87, s60 s_add_i32 s26, s26, s27 s_mul_i32 s27, s86, s60 s_add_u32 s27, s27, s78 s_addc_u32 s26, s26, s85 s_sub_i32 s41, s40, s79 s_add_i32 s41, s41, s52 s_max_i32 s41, s41, 0 s_add_u32 s52, s44, s27 s_addc_u32 s53, s45, s26 s_lshl_b32 s26, s41, 8 s_or_b32 s55, s26, 0x10000 s_mov_b32 s54, s42 s_add_i32 s41, s81, 0x2000 matrix_load_64x16_b8 s[52:55] s81 t r lds s_add_i32 s27, s81, 0x4000 matrix_load_64x16_b8 s[52:55] s41 moffset:128 t r lds s_add_i32 s26, s81, 0x6000 matrix_load_64x16_b8 s[52:55] s27 moffset:256 t r lds matrix_load_64x16_b8 s[52:55] s26 moffset:384 t r lds s_cmp_gt_i32 s83, 3 s_mov_b64 s[26:27], -1 s_cbranch_scc0 .LBB1_20 ; %bb.19: ; in Loop: Header=BB1_5 Depth=1 v_readfirstlane_b32 s26, v121 s_ashr_i32 s27, s26, 31 s_lshr_b32 s27, s27, 30 s_add_i32 s27, s26, s27 s_and_b32 s41, s27, 0x3ffffc s_lshl_b32 s27, s27, 10 s_sub_i32 s26, s26, s41 s_and_b32 s27, s27, 0xfffff000 s_lshl_b32 s26, s26, 10 
s_add_i32 s27, s27, 0 s_add_i32 s26, s27, s26 s_add_i32 s26, s26, 0x8000 v_mov_b32_e32 v0, -1 ;;#ASMSTART s_mov_b32 m0, s26 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[26:27], 0 .LBB1_20: ; %Flow286 ; in Loop: Header=BB1_5 Depth=1 s_andn2_b64 vcc, exec, s[26:27] s_cbranch_vccnz .LBB1_22 ; %bb.21: ; in Loop: Header=BB1_5 Depth=1 s_add_i32 s26, s81, 0x8000 s_nop 0 matrix_load_64x16_b8 s[52:55] s26 moffset:512 t r lds .LBB1_22: ; in Loop: Header=BB1_5 Depth=1 s_mul_i32 s26, s84, 0xfffff400 s_add_i32 s82, s82, s26 ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND s_mov_b32 m0, s82 s_nop 0 ds_read_matrix_trans_format v[0:3], m0 element:1 row:3 col:1 s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[44:45], s[48:49] v_mov_b64_e32 v[46:47], s[50:51] v_mov_b64_e32 v[50:51], v[46:47] v_mov_b64_e32 v[48:49], v[44:45] ds_read_matrix_trans_format v[40:43], m0 offset:2048 element:1 row:3 col:1 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[4:5], v[0:1], v[48:51] lit ds_read_matrix_trans_format v[52:55], m0 offset:4096 element:1 row:3 col:1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[6:7], v[2:3], v[48:51] lit ds_read_matrix_trans_format v[0:3], m0 offset:6144 element:1 row:3 col:1 s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[4:5], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[6:7], v[42:43], v[44:47] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[8:9], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[10:11], v[54:55], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[8:9], v[0:1], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[10:11], v[2:3], v[44:47] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[0:3], m0 offset:8192 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:10240 element:1 row:3 col:1 ds_read_matrix_trans_format v[52:55], m0 offset:12288 element:1 
row:3 col:1 ds_read_matrix_trans_format v[56:59], m0 offset:14336 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[12:13], v[0:1], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[14:15], v[2:3], v[48:51] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[12:13], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[14:15], v[42:43], v[44:47] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[16:17], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[18:19], v[54:55], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[16:17], v[56:57], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[18:19], v[58:59], v[44:47] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[0:3], m0 offset:16384 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:18432 element:1 row:3 col:1 ds_read_matrix_trans_format v[52:55], m0 offset:20480 element:1 row:3 col:1 ds_read_matrix_trans_format v[56:59], m0 offset:22528 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[20:21], v[0:1], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[22:23], v[2:3], v[48:51] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[20:21], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[22:23], v[42:43], v[44:47] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[24:25], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[26:27], v[54:55], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[24:25], v[56:57], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[26:27], v[58:59], v[44:47] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[0:3], m0 offset:24576 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:26624 element:1 row:3 col:1 ds_read_matrix_trans_format v[52:55], m0 offset:28672 
element:1 row:3 col:1 ds_read_matrix_trans_format v[56:59], m0 offset:30720 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[28:29], v[0:1], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[30:31], v[2:3], v[48:51] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[28:29], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[30:31], v[42:43], v[44:47] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[32:33], v[52:53], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[34:35], v[54:55], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[32:33], v[56:57], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[34:35], v[58:59], v[44:47] lit ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[0:3], m0 offset:32768 element:1 row:3 col:1 ds_read_matrix_trans_format v[40:43], m0 offset:34816 element:1 row:3 col:1 s_sub_i32 s26, s79, s40 v_cmp_gt_i32_e32 vcc, s26, v122 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[36:37], v[0:1], v[48:51] lit v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[38:39], v[2:3], v[48:51] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[36:37], v[40:41], v[44:47] lit v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[38:39], v[42:43], v[44:47] lit v_mov_b32_e32 v42, 0xff800000 s_nop 0 v_cndmask_b32_e32 v43, v42, v48, vcc v_cmp_gt_i32_e32 vcc, s26, v131 v_cndmask_b32_e32 v0, v42, v49, vcc v_cmp_gt_i32_e32 vcc, s26, v132 v_cndmask_b32_e32 v1, v42, v50, vcc v_cmp_gt_i32_e32 vcc, s26, v133 v_cndmask_b32_e32 v2, v42, v51, vcc v_cmp_gt_i32_e32 vcc, s26, v134 v_cndmask_b32_e32 v3, v42, v44, vcc v_cmp_gt_i32_e32 vcc, s26, v135 v_cndmask_b32_e32 v40, v42, v45, vcc v_cmp_gt_i32_e32 vcc, s26, v136 v_cndmask_b32_e32 v41, v42, v46, vcc v_cmp_gt_i32_e32 vcc, s26, v137 v_cndmask_b32_e32 v42, v42, v47, vcc v_and_b32_e32 v45, 63, v153 v_and_b32_e32 v47, 64, v153 v_max_f32_e32 v44, v43, v0 v_xor_b32_e32 v46, 32, v45 
v_add_u32_e32 v47, 64, v47 v_max3_f32 v44, v44, v1, v2 v_cmp_lt_i32_e32 vcc, v46, v47 v_max3_f32 v44, v44, v3, v40 v_cndmask_b32_e32 v46, v153, v46, vcc v_max3_f32 v44, v44, v41, v42 v_lshlrev_b32_e32 v46, 2, v46 ds_bpermute_b32 v46, v46, v44 v_xor_b32_e32 v45, 16, v45 v_cmp_lt_i32_e32 vcc, v45, v47 v_cndmask_b32_e32 v45, v153, v45, vcc v_lshlrev_b32_e32 v45, 2, v45 s_waitcnt lgkmcnt(0) v_max_f32_e32 v44, v44, v46 ds_bpermute_b32 v45, v45, v44 s_and_saveexec_b64 s[26:27], s[2:3] s_cbranch_execz .LBB1_24 ; %bb.23: ; in Loop: Header=BB1_5 Depth=1 s_waitcnt lgkmcnt(0) v_max_f32_e32 v44, v44, v45 ds_write_b32 v113, v44 .LBB1_24: ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[26:27] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[26:27], s[4:5] s_cbranch_execz .LBB1_26 ; %bb.25: ; in Loop: Header=BB1_5 Depth=1 ds_read_b64 v[44:45], v114 s_waitcnt lgkmcnt(0) v_max_f32_e32 v44, v44, v45 ds_write_b32 v123, v44 offset:512 .LBB1_26: ; %_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp1ILb1ELb0ELb1EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi256EEEEEENS8_IJSB_EEEEEEEEEvRT2_RT3_fPDv4_f.exit.i ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[26:27] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v158, v123 offset:512 s_add_i32 s40, s11, -2 s_waitcnt lgkmcnt(0) v_mul_f32_e32 v44, v158, v112 v_cmp_lg_f32_e32 vcc, s77, v158 v_cndmask_b32_e32 v44, 0, v44, vcc v_fma_f32 v0, v0, v112, -v44 v_fma_f32 v43, v43, v112, -v44 v_exp_f32_e32 v107, v0 v_fma_f32 v0, v1, v112, -v44 v_exp_f32_e32 v106, v43 v_exp_f32_e32 v154, v0 v_fma_f32 v0, v2, v112, -v44 s_nop 0 v_exp_f32_e32 v156, v0 v_fma_f32 v0, v3, v112, -v44 s_nop 0 v_exp_f32_e32 v179, v0 v_fma_f32 v0, v40, v112, -v44 s_nop 0 v_exp_f32_e32 v181, v0 v_fma_f32 v0, v41, v112, -v44 s_nop 0 v_exp_f32_e32 v183, v0 v_fma_f32 v0, 
v42, v112, -v44 s_nop 0 v_exp_f32_e32 v184, v0 ; sched_barrier mask(0x00000000) v_mov_b32_e32 v0, v106 v_cvt_pk_fp8_f32 v0, v0, v107, s0 v_mov_b32_e32 v1, v179 v_cvt_pk_fp8_f32 v1, v1, v181, s0 v_mov_b32_e32 v2, v154 v_cvt_pk_fp8_f32 v2, v2, v156, v0 op_sel:[0,0,0,1] v_mov_b32_e32 v0, v183 v_cvt_pk_fp8_f32 v0, v0, v184, v1 op_sel:[0,0,0,1] v_add_u32_e32 v1, 0, v125 v_add_u32_e32 v1, 0xa000, v1 ds_write2_b32 v1, v2, v0 offset1:2 v_add_u32_e32 v0, 0, v124 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[100:103], v0 offset:40960 ; sched_barrier mask(0x00000000) ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[54:56:58:60], v126 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[55:57:59:61], v115 ds_read_m64x16_b8_alt4 v[62:64:66:68], v116 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[63:65:67:69], v117 ds_read_m64x16_b8_alt4 v[70:72:74:76], v138 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[71:73:75:77], v139 ds_read_m64x16_b8_alt4 v[78:80:82:84], v140 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[79:81:83:85], v141 ds_read_m64x16_b8_alt4 v[86:88:90:92], v142 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[87:89:91:93], v143 ds_read_m64x16_b8_alt4 v[155:157:159:161], v144 ds_read_m64x16_b8_alt4 v[160:162:164:166], v145 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[163:165:167:169], v146 ds_read_m64x16_b8_alt4 v[168:170:172:174], v147 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[171:173:175:177], v150 ds_read_m64x16_b8_alt4 v[176:178:180:182], v151 ; sched_barrier mask(0x00000000) s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[0:1], s[48:49] v_mov_b64_e32 v[2:3], s[50:51] v_mov_b32_e32 v40, v54 v_mov_b32_e32 v41, v55 v_mov_b64_e32 v[54:55], v[2:3] v_mov_b64_e32 v[52:53], v[0:1] v_mov_b64_e32 v[50:51], v[2:3] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[100:101], v[40:41], v[52:55] lit v_mov_b32_e32 v40, v56 v_mov_b32_e32 v41, v57 v_mov_b64_e32 v[48:49], v[0:1] v_mov_b64_e32 v[46:47], v[2:3] s_nop 0 
v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[100:101], v[40:41], v[48:51] lit v_mov_b32_e32 v40, v58 v_mov_b32_e32 v41, v59 v_mov_b64_e32 v[44:45], v[0:1] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[100:101], v[40:41], v[44:47] lit v_mov_b64_e32 v[42:43], v[2:3] v_mov_b32_e32 v56, v60 v_mov_b32_e32 v57, v61 v_mov_b64_e32 v[40:41], v[0:1] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[100:101], v[56:57], v[40:43] lit v_mov_b32_e32 v56, v62 v_mov_b32_e32 v57, v63 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[102:103], v[56:57], v[52:55] lit v_mov_b32_e32 v56, v64 v_mov_b32_e32 v57, v65 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[102:103], v[56:57], v[48:51] lit v_mov_b32_e32 v56, v66 v_mov_b32_e32 v57, v67 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[102:103], v[56:57], v[44:47] lit v_mov_b32_e32 v56, v68 v_mov_b32_e32 v57, v69 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[102:103], v[56:57], v[40:43] lit v_mov_b32_e32 v56, v70 v_mov_b32_e32 v57, v71 v_mov_b64_e32 v[70:71], v[2:3] v_mov_b64_e32 v[68:69], v[0:1] v_mov_b64_e32 v[66:67], v[2:3] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[100:101], v[56:57], v[68:71] lit v_mov_b32_e32 v56, v72 v_mov_b32_e32 v57, v73 v_mov_b64_e32 v[64:65], v[0:1] v_mov_b64_e32 v[62:63], v[2:3] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[100:101], v[56:57], v[64:67] lit v_mov_b32_e32 v56, v74 v_mov_b32_e32 v57, v75 v_mov_b64_e32 v[60:61], v[0:1] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[100:101], v[56:57], v[60:63] lit v_mov_b64_e32 v[58:59], v[2:3] v_mov_b32_e32 v72, v76 v_mov_b32_e32 v73, v77 v_mov_b64_e32 v[56:57], v[0:1] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[100:101], v[72:73], v[56:59] lit v_mov_b32_e32 v72, v78 v_mov_b32_e32 v73, v79 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[102:103], v[72:73], v[68:71] lit v_mov_b32_e32 v72, v80 v_mov_b32_e32 v73, v81 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[102:103], v[72:73], v[64:67] lit v_mov_b32_e32 v72, v82 v_mov_b32_e32 v73, v83 s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[102:103], v[72:73], v[60:63] lit v_mov_b32_e32 v72, v84 v_mov_b32_e32 v73, v85 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[102:103], v[72:73], v[56:59] lit v_mov_b32_e32 v72, v86 v_mov_b32_e32 v73, v87 v_mov_b64_e32 v[86:87], v[2:3] v_mov_b64_e32 v[84:85], v[0:1] v_mov_b64_e32 v[82:83], v[2:3] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[100:101], v[72:73], v[84:87] lit v_mov_b32_e32 v72, v88 v_mov_b32_e32 v73, v89 v_mov_b64_e32 v[80:81], v[0:1] v_mov_b64_e32 v[78:79], v[2:3] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[100:101], v[72:73], v[80:83] lit v_mov_b32_e32 v72, v90 v_mov_b32_e32 v73, v91 v_mov_b64_e32 v[76:77], v[0:1] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[100:101], v[72:73], v[76:79] lit v_mov_b64_e32 v[74:75], v[2:3] v_mov_b32_e32 v88, v92 v_mov_b32_e32 v89, v93 v_mov_b64_e32 v[72:73], v[0:1] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[100:101], v[88:89], v[72:75] lit v_mov_b32_e32 v88, v155 v_mov_b32_e32 v89, v160 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[102:103], v[88:89], v[84:87] lit v_mov_b32_e32 v88, v157 v_mov_b32_e32 v89, v162 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[102:103], v[88:89], v[80:83] lit v_mov_b32_e32 v88, v159 v_mov_b32_e32 v89, v164 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[102:103], v[88:89], v[76:79] lit v_mov_b32_e32 v88, v161 v_mov_b32_e32 v89, v166 v_mov_b64_e32 v[98:99], v[2:3] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[102:103], v[88:89], v[72:75] lit v_mov_b32_e32 v88, v163 v_mov_b32_e32 v89, v168 v_mov_b64_e32 v[96:97], v[0:1] v_mov_b64_e32 v[94:95], v[2:3] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[100:101], v[88:89], v[96:99] lit v_mov_b32_e32 v88, v165 v_mov_b32_e32 v89, v170 v_mov_b64_e32 v[92:93], v[0:1] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[100:101], v[88:89], v[92:95] lit v_mov_b64_e32 v[90:91], v[2:3] v_mov_b32_e32 v104, v167 s_waitcnt lgkmcnt(1) v_mov_b32_e32 v105, v172 v_mov_b64_e32 v[88:89], v[0:1] s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[100:101], v[104:105], v[88:91] lit v_mov_b32_e32 v104, v169 v_mov_b32_e32 v105, v174 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[0:3], v[100:101], v[104:105], v[0:3] lit v_mov_b32_e32 v100, v171 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v101, v176 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[102:103], v[100:101], v[96:99] lit v_mov_b32_e32 v100, v173 v_mov_b32_e32 v101, v178 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[102:103], v[100:101], v[92:95] lit v_mov_b32_e32 v100, v175 v_mov_b32_e32 v101, v180 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[102:103], v[100:101], v[88:91] lit v_mov_b32_e32 v100, v177 v_mov_b32_e32 v101, v182 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[0:3], v[102:103], v[100:101], v[0:3] lit ;;#ASMSTART s_barrier ;;#ASMEND v_add_f32_e32 v100, v107, v106 v_add_f32_e32 v100, v100, v154 v_add_f32_e32 v100, v100, v156 v_add_f32_e32 v100, v100, v179 v_add_f32_e32 v100, v100, v181 v_add_f32_e32 v100, v100, v183 v_add_f32_e32 v154, v100, v184 s_cmp_lt_i32 s40, s80 s_cbranch_scc0 .LBB1_31 .LBB1_27: ; in Loop: Header=BB1_5 Depth=1 v_mov_b32_e32 v155, v158 .LBB1_28: ; %._crit_edge.i ; in Loop: Header=BB1_5 Depth=1 s_cmp_eq_u32 s80, 0 s_cselect_b64 s[26:27], -1, 0 s_add_i32 s40, s79, 63 s_ashr_i32 s41, s40, 31 s_lshr_b32 s41, s41, 26 s_add_i32 s40, s40, s41 s_ashr_i32 s40, s40, 6 s_cmp_eq_u32 s11, s40 s_cselect_b64 s[40:41], -1, 0 s_and_b64 s[26:27], s[26:27], s[40:41] s_andn2_b64 vcc, exec, s[26:27] s_mov_b64 s[26:27], -1 s_cbranch_vccnz .LBB1_42 ; %bb.29: ; %Flow279 ; in Loop: Header=BB1_5 Depth=1 s_and_b64 vcc, exec, s[26:27] s_cbranch_vccz .LBB1_4 s_branch .LBB1_51 .LBB1_30: ; in Loop: Header=BB1_5 Depth=1 s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[0:1], s[48:49] v_mov_b64_e32 v[2:3], s[50:51] v_mov_b64_e32 v[90:91], v[2:3] v_mov_b64_e32 v[94:95], v[2:3] v_mov_b64_e32 v[98:99], v[2:3] v_mov_b64_e32 v[74:75], v[2:3] v_mov_b64_e32 v[78:79], v[2:3] v_mov_b64_e32 v[82:83], v[2:3] v_mov_b64_e32 
v[86:87], v[2:3] v_mov_b64_e32 v[58:59], v[2:3] v_mov_b64_e32 v[62:63], v[2:3] v_mov_b64_e32 v[66:67], v[2:3] v_mov_b64_e32 v[70:71], v[2:3] v_mov_b64_e32 v[42:43], v[2:3] v_mov_b64_e32 v[46:47], v[2:3] v_mov_b64_e32 v[50:51], v[2:3] v_mov_b64_e32 v[54:55], v[2:3] v_mov_b32_e32 v158, 0 v_mov_b64_e32 v[88:89], v[0:1] v_mov_b64_e32 v[92:93], v[0:1] v_mov_b64_e32 v[96:97], v[0:1] v_mov_b64_e32 v[72:73], v[0:1] v_mov_b64_e32 v[76:77], v[0:1] v_mov_b64_e32 v[80:81], v[0:1] v_mov_b64_e32 v[84:85], v[0:1] v_mov_b64_e32 v[56:57], v[0:1] v_mov_b64_e32 v[60:61], v[0:1] v_mov_b64_e32 v[64:65], v[0:1] v_mov_b64_e32 v[68:69], v[0:1] v_mov_b64_e32 v[40:41], v[0:1] v_mov_b64_e32 v[44:45], v[0:1] v_mov_b64_e32 v[48:49], v[0:1] v_mov_b64_e32 v[52:53], v[0:1] s_cmp_lt_i32 s40, s80 s_cbranch_scc1 .LBB1_27 .LBB1_31: ; %.lr.ph718.i ; in Loop: Header=BB1_5 Depth=1 s_add_i32 s54, s81, 0x2000 s_add_i32 s55, s81, 0x4000 s_add_i32 s82, s81, 0x6000 s_cmp_gt_i32 s83, 3 s_cselect_b64 s[26:27], -1, 0 s_lshl_b32 s41, s84, 10 s_add_i32 s84, s41, 0 s_mul_i32 s41, s86, s61 s_mul_hi_u32 s44, s86, s60 s_add_i32 s41, s44, s41 s_mul_i32 s87, s87, s60 s_add_i32 s83, s81, 0x8000 s_add_i32 s41, s41, s87 s_mul_i32 s86, s86, s60 s_add_u32 s86, s86, s78 s_addc_u32 s85, s41, s85 s_ashr_i32 s41, s40, 31 s_add_i32 s87, s40, 1 s_lshl_b64 s[40:41], s[40:41], 2 s_add_u32 s52, s89, s40 s_addc_u32 s53, s88, s41 ; implicit-def: $vgpr157 ; implicit-def: $vgpr156 .LBB1_32: ; Parent Loop BB1_5 Depth=1 ; => This Inner Loop Header: Depth=2 s_mov_b64 s[40:41], s[52:53] ;;#ASMSTART s_load_dword s44, s[40:41], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s40, s44, 31 s_mul_i32 s41, s44, s59 s_mul_hi_u32 s45, s44, s58 s_add_i32 s41, s45, s41 s_mul_i32 s40, s40, s58 s_add_i32 s41, s41, s40 s_mul_i32 s44, s44, s58 s_add_u32 s44, s70, s44 s_addc_u32 s45, s71, s41 s_add_u32 s40, s44, s86 s_addc_u32 s41, s45, s85 s_nop 0 matrix_load_64x16_b8 s[40:43] s81 t r lds matrix_load_64x16_b8 s[40:43] s54 moffset:128 t r lds 
matrix_load_64x16_b8 s[40:43] s55 moffset:256 t r lds matrix_load_64x16_b8 s[40:43] s82 moffset:384 t r lds s_mov_b64 s[50:51], -1 s_and_b64 vcc, exec, s[26:27] s_cbranch_vccz .LBB1_34 ; %bb.33: ; in Loop: Header=BB1_32 Depth=2 v_readfirstlane_b32 s49, v121 s_ashr_i32 s50, s49, 31 s_lshr_b32 s50, s50, 30 s_add_i32 s50, s49, s50 s_and_b32 s51, s50, 0x3ffffc s_lshl_b32 s50, s50, 10 s_sub_i32 s49, s49, s51 s_and_b32 s50, s50, 0xfffff000 s_lshl_b32 s49, s49, 10 s_add_i32 s50, s50, 0 s_add_i32 s49, s50, s49 s_add_i32 s49, s49, 0x8000 v_mov_b32_e32 v100, -1 ;;#ASMSTART s_mov_b32 m0, s49 buffer_load_dwordx4 v100, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[50:51], 0 .LBB1_34: ; %Flow282 ; in Loop: Header=BB1_32 Depth=2 s_andn2_b64 vcc, exec, s[50:51] s_cbranch_vccnz .LBB1_36 ; %bb.35: ; in Loop: Header=BB1_32 Depth=2 matrix_load_64x16_b8 s[40:43] s83 moffset:512 t r lds .LBB1_36: ; in Loop: Header=BB1_32 Depth=2 ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND s_mov_b32 m0, s84 s_nop 0 ds_read_matrix_trans_format v[159:162], m0 element:1 row:3 col:1 s_mov_b32 s50, s48 s_mov_b32 s51, s48 s_mov_b32 s49, s48 v_mov_b64_e32 v[102:103], s[50:51] v_mov_b64_e32 v[100:101], s[48:49] v_mov_b64_e32 v[106:107], v[102:103] v_mov_b64_e32 v[104:105], v[100:101] ds_read_matrix_trans_format v[163:166], m0 offset:2048 element:1 row:3 col:1 ds_read_matrix_trans_format v[167:170], m0 offset:4096 element:1 row:3 col:1 ds_read_matrix_trans_format v[171:174], m0 offset:6144 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[4:5], v[159:160], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[6:7], v[161:162], v[104:107] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[4:5], v[163:164], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[6:7], v[165:166], v[100:103] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[8:9], v[167:168], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[10:11], 
v[169:170], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[8:9], v[171:172], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[10:11], v[173:174], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[159:162], m0 offset:8192 element:1 row:3 col:1 ds_read_matrix_trans_format v[163:166], m0 offset:10240 element:1 row:3 col:1 ds_read_matrix_trans_format v[167:170], m0 offset:12288 element:1 row:3 col:1 ds_read_matrix_trans_format v[171:174], m0 offset:14336 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[12:13], v[159:160], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[14:15], v[161:162], v[104:107] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[12:13], v[163:164], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[14:15], v[165:166], v[100:103] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[16:17], v[167:168], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[18:19], v[169:170], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[16:17], v[171:172], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[18:19], v[173:174], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[159:162], m0 offset:16384 element:1 row:3 col:1 ds_read_matrix_trans_format v[163:166], m0 offset:18432 element:1 row:3 col:1 ds_read_matrix_trans_format v[167:170], m0 offset:20480 element:1 row:3 col:1 ds_read_matrix_trans_format v[171:174], m0 offset:22528 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[20:21], v[159:160], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[22:23], v[161:162], v[104:107] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[20:21], v[163:164], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[22:23], v[165:166], v[100:103] lit s_waitcnt lgkmcnt(1) 
v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[24:25], v[167:168], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[26:27], v[169:170], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[24:25], v[171:172], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[26:27], v[173:174], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[159:162], m0 offset:24576 element:1 row:3 col:1 ds_read_matrix_trans_format v[163:166], m0 offset:26624 element:1 row:3 col:1 ds_read_matrix_trans_format v[167:170], m0 offset:28672 element:1 row:3 col:1 ds_read_matrix_trans_format v[171:174], m0 offset:30720 element:1 row:3 col:1 s_waitcnt lgkmcnt(3) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[28:29], v[159:160], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[30:31], v[161:162], v[104:107] lit s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[28:29], v[163:164], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[30:31], v[165:166], v[100:103] lit s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[32:33], v[167:168], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[34:35], v[169:170], v[104:107] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[32:33], v[171:172], v[100:103] lit v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[34:35], v[173:174], v[100:103] lit ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[159:162], m0 offset:32768 element:1 row:3 col:1 ds_read_matrix_trans_format v[163:166], m0 offset:34816 element:1 row:3 col:1 v_mbcnt_hi_u32_b32 v155, -1, v152 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[36:37], v[159:160], v[104:107] lit v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[38:39], v[161:162], v[104:107] lit v_and_b32_e32 v160, 63, v155 v_and_b32_e32 v162, 64, v155 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[36:37], v[163:164], v[100:103] lit v_max3_f32 v159, v158, v104, v105 
v_xor_b32_e32 v161, 32, v160 v_add_u32_e32 v162, 64, v162 v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[38:39], v[165:166], v[100:103] lit v_max3_f32 v159, v159, v106, v107 v_cmp_lt_i32_e32 vcc, v161, v162 s_nop 0 v_max3_f32 v159, v159, v100, v101 v_cndmask_b32_e32 v161, v155, v161, vcc v_max3_f32 v159, v159, v102, v103 v_lshlrev_b32_e32 v161, 2, v161 ds_bpermute_b32 v161, v161, v159 v_xor_b32_e32 v160, 16, v160 v_cmp_lt_i32_e32 vcc, v160, v162 v_cndmask_b32_e32 v155, v155, v160, vcc v_lshlrev_b32_e32 v160, 2, v155 s_waitcnt lgkmcnt(0) v_max_f32_e32 v155, v159, v161 ds_bpermute_b32 v159, v160, v155 s_and_saveexec_b64 s[40:41], s[2:3] s_cbranch_execz .LBB1_38 ; %bb.37: ; in Loop: Header=BB1_32 Depth=2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v155, v155, v159 ds_write_b32 v113, v155 .LBB1_38: ; in Loop: Header=BB1_32 Depth=2 s_or_b64 exec, exec, s[40:41] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[40:41], s[4:5] s_cbranch_execz .LBB1_40 ; %bb.39: ; in Loop: Header=BB1_32 Depth=2 ds_read_b64 v[159:160], v114 s_waitcnt lgkmcnt(0) v_max_f32_e32 v155, v159, v160 ds_write_b32 v123, v155 offset:512 .LBB1_40: ; %_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp1ILb0ELb0ELb1EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi256EEEEEENS8_IJSB_EEEEEEEEEvRT2_RT3_fPDv4_f.exit.i ; in Loop: Header=BB1_32 Depth=2 s_or_b64 exec, exec, s[40:41] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v155, v123 offset:512 s_waitcnt lgkmcnt(0) v_sub_f32_e32 v158, v158, v155 v_mul_f32_e32 v158, v158, v112 v_cmp_lg_f32_e32 vcc, s77, v155 v_exp_f32_e32 v158, v158 v_mov_b32_e32 v159, v158 v_pk_mul_f32 v[52:53], v[158:159], v[52:53] v_pk_mul_f32 v[54:55], v[158:159], v[54:55] v_pk_mul_f32 v[48:49], v[158:159], v[48:49] v_pk_mul_f32 v[50:51], v[158:159], v[50:51] v_pk_mul_f32 v[44:45], v[158:159], v[44:45] 
v_pk_mul_f32 v[46:47], v[158:159], v[46:47] v_pk_mul_f32 v[40:41], v[158:159], v[40:41] v_pk_mul_f32 v[42:43], v[158:159], v[42:43] v_pk_mul_f32 v[68:69], v[158:159], v[68:69] v_pk_mul_f32 v[70:71], v[158:159], v[70:71] v_pk_mul_f32 v[64:65], v[158:159], v[64:65] v_pk_mul_f32 v[66:67], v[158:159], v[66:67] v_pk_mul_f32 v[60:61], v[158:159], v[60:61] v_pk_mul_f32 v[62:63], v[158:159], v[62:63] v_pk_mul_f32 v[56:57], v[158:159], v[56:57] v_pk_mul_f32 v[58:59], v[158:159], v[58:59] v_pk_mul_f32 v[84:85], v[158:159], v[84:85] v_pk_mul_f32 v[86:87], v[158:159], v[86:87] v_pk_mul_f32 v[80:81], v[158:159], v[80:81] v_pk_mul_f32 v[82:83], v[158:159], v[82:83] v_pk_mul_f32 v[76:77], v[158:159], v[76:77] v_pk_mul_f32 v[78:79], v[158:159], v[78:79] v_pk_mul_f32 v[72:73], v[158:159], v[72:73] v_pk_mul_f32 v[74:75], v[158:159], v[74:75] v_pk_mul_f32 v[96:97], v[158:159], v[96:97] v_pk_mul_f32 v[98:99], v[158:159], v[98:99] v_pk_mul_f32 v[92:93], v[158:159], v[92:93] v_pk_mul_f32 v[94:95], v[158:159], v[94:95] v_pk_mul_f32 v[88:89], v[158:159], v[88:89] v_pk_mul_f32 v[90:91], v[158:159], v[90:91] v_pk_mul_f32 v[0:1], v[158:159], v[0:1] v_pk_mul_f32 v[2:3], v[158:159], v[2:3] v_mul_f32_e32 v159, v155, v112 v_cndmask_b32_e32 v159, 0, v159, vcc v_fma_f32 v104, v104, v112, -v159 v_fma_f32 v105, v105, v112, -v159 v_exp_f32_e32 v104, v104 v_exp_f32_e32 v105, v105 v_fma_f32 v106, v106, v112, -v159 v_fma_f32 v107, v107, v112, -v159 v_exp_f32_e32 v106, v106 v_exp_f32_e32 v107, v107 v_fma_f32 v100, v100, v112, -v159 v_fma_f32 v103, v103, v112, -v159 v_exp_f32_e32 v100, v100 v_fma_f32 v101, v101, v112, -v159 v_exp_f32_e32 v222, v103 v_fma_f32 v103, v158, v154, v104 v_exp_f32_e32 v101, v101 v_fma_f32 v102, v102, v112, -v159 v_add_f32_e32 v103, v103, v105 v_exp_f32_e32 v102, v102 v_add_f32_e32 v103, v103, v106 v_add_f32_e32 v103, v103, v107 v_add_f32_e32 v103, v103, v100 v_add_f32_e32 v103, v103, v101 v_add_f32_e32 v154, v103, v102 ; sched_barrier mask(0x00000000) v_cvt_pk_fp8_f32 v104, 
v104, v105, v157 v_cvt_pk_fp8_f32 v100, v100, v101, v156 v_mov_b32_e32 v157, v106 v_mov_b32_e32 v156, v102 v_cvt_pk_fp8_f32 v157, v157, v107, v104 op_sel:[0,0,0,1] v_cvt_pk_fp8_f32 v156, v156, v222, v100 op_sel:[0,0,0,1] v_add_u32_e32 v100, 0, v125 v_add_u32_e32 v100, 0xa000, v100 ds_write2_b32 v100, v157, v156 offset1:2 v_add_u32_e32 v100, 0, v124 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[100:103], v100 offset:40960 ; sched_barrier mask(0x00000000) ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[158:160:162:164], v126 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[159:161:163:165], v115 ds_read_m64x16_b8_alt4 v[166:168:170:172], v116 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[167:169:171:173], v117 ds_read_m64x16_b8_alt4 v[174:176:178:180], v138 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[175:177:179:181], v139 ds_read_m64x16_b8_alt4 v[182:184:186:188], v140 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[183:185:187:189], v141 ds_read_m64x16_b8_alt4 v[190:192:194:196], v142 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[191:193:195:197], v143 ds_read_m64x16_b8_alt4 v[198:200:202:204], v144 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[199:201:203:205], v145 ds_read_m64x16_b8_alt4 v[206:208:210:212], v146 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[207:209:211:213], v147 ds_read_m64x16_b8_alt4 v[214:216:218:220], v150 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[215:217:219:221], v151 s_waitcnt lgkmcnt(14) ; sched_barrier mask(0x00000000) # v_mov_b32_e32 v104, v158 # v_mov_b32_e32 v105, v159 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[100:101], v[158:159], v[52:55] lit # v_mov_b32_e32 v104, v160 # v_mov_b32_e32 v105, v161 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[100:101], v[160:161], v[48:51] lit # v_mov_b32_e32 v104, v162 # v_mov_b32_e32 v105, v163 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[100:101], v[162:163], v[44:47] lit # v_mov_b32_e32 v104, v164 # v_mov_b32_e32 v105, v165 # s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[100:101], v[164:165], v[40:43] lit s_waitcnt lgkmcnt(12) # v_mov_b32_e32 v104, v166 # v_mov_b32_e32 v105, v167 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[52:55], v[102:103], v[166:167], v[52:55] lit # v_mov_b32_e32 v104, v168 # v_mov_b32_e32 v105, v169 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[48:51], v[102:103], v[168:169], v[48:51] lit # v_mov_b32_e32 v104, v170 # v_mov_b32_e32 v105, v171 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[44:47], v[102:103], v[170:171], v[44:47] lit # v_mov_b32_e32 v104, v172 # v_mov_b32_e32 v105, v173 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[40:43], v[102:103], v[172:173], v[40:43] lit s_waitcnt lgkmcnt(10) # v_mov_b32_e32 v104, v174 # v_mov_b32_e32 v105, v175 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[100:101], v[174:175], v[68:71] lit # v_mov_b32_e32 v104, v176 # v_mov_b32_e32 v105, v177 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[100:101], v[176:177], v[64:67] lit # v_mov_b32_e32 v104, v178 # v_mov_b32_e32 v105, v179 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[100:101], v[178:179], v[60:63] lit # v_mov_b32_e32 v104, v180 # v_mov_b32_e32 v105, v181 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[100:101], v[180:181], v[56:59] lit s_waitcnt lgkmcnt(8) # v_mov_b32_e32 v104, v182 # v_mov_b32_e32 v105, v183 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[68:71], v[102:103], v[182:183], v[68:71] lit # v_mov_b32_e32 v104, v184 # v_mov_b32_e32 v105, v185 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[64:67], v[102:103], v[184:185], v[64:67] lit # v_mov_b32_e32 v104, v186 # v_mov_b32_e32 v105, v187 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[60:63], v[102:103], v[186:187], v[60:63] lit # v_mov_b32_e32 v104, v188 # v_mov_b32_e32 v105, v189 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[56:59], v[102:103], v[188:189], v[56:59] lit # v_mov_b32_e32 v104, v190 # v_mov_b32_e32 v105, v191 # s_nop 1 s_waitcnt lgkmcnt(6) v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[100:101], v[190:191], v[84:87] lit # v_mov_b32_e32 v104, v192 # 
v_mov_b32_e32 v105, v193 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[100:101], v[192:193], v[80:83] lit # v_mov_b32_e32 v104, v194 # v_mov_b32_e32 v105, v195 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[100:101], v[194:195], v[76:79] lit # v_mov_b32_e32 v104, v196 # v_mov_b32_e32 v105, v197 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[100:101], v[196:197], v[72:75] lit s_waitcnt lgkmcnt(4) # v_mov_b32_e32 v104, v198 # v_mov_b32_e32 v105, v199 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[84:87], v[102:103], v[198:199], v[84:87] lit # v_mov_b32_e32 v104, v200 # v_mov_b32_e32 v105, v201 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[80:83], v[102:103], v[200:201], v[80:83] lit # v_mov_b32_e32 v104, v202 # v_mov_b32_e32 v105, v203 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[76:79], v[102:103], v[202:203], v[76:79] lit # v_mov_b32_e32 v104, v204 # v_mov_b32_e32 v105, v205 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[72:75], v[102:103], v[204:205], v[72:75] lit # v_mov_b32_e32 v104, v206 # v_mov_b32_e32 v105, v207 # s_nop 1 s_waitcnt lgkmcnt(2) v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[100:101], v[206:207], v[96:99] lit # v_mov_b32_e32 v104, v208 # v_mov_b32_e32 v105, v209 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[100:101], v[208:209], v[92:95] lit # v_mov_b32_e32 v104, v210 # v_mov_b32_e32 v105, v211 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[100:101], v[210:211], v[88:91] lit # v_mov_b32_e32 v104, v212 # v_mov_b32_e32 v105, v213 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[0:3], v[100:101], v[212:213], v[0:3] lit # v_mov_b32_e32 v100, v214 # s_waitcnt lgkmcnt(0) # v_mov_b32_e32 v101, v215 # s_nop 1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[96:99], v[102:103], v[214:215], v[96:99] lit # v_mov_b32_e32 v100, v216 # v_mov_b32_e32 v101, v217 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[92:95], v[102:103], v[216:217], v[92:95] lit # v_mov_b32_e32 v100, v218 # v_mov_b32_e32 v101, v219 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[88:91], v[102:103], v[218:219], v[88:91] lit 
# v_mov_b32_e32 v100, v220 # v_mov_b32_e32 v101, v221 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[0:3], v[102:103], v[220:221], v[0:3] lit v_add_f32_e32 v154, v154, v222 ;;#ASMSTART s_barrier ;;#ASMEND s_add_i32 s87, s87, -1 s_add_u32 s52, s52, -4 s_addc_u32 s53, s53, -1 s_cmp_gt_i32 s87, s80 s_cbranch_scc0 .LBB1_28 ; %bb.41: ; in Loop: Header=BB1_32 Depth=2 v_mov_b32_e32 v158, v155 s_branch .LBB1_32 .LBB1_42: ; in Loop: Header=BB1_5 Depth=1 s_add_u32 s20, s24, s20 s_addc_u32 s21, s25, s21 global_load_dword v6, v108, s[20:21] v_mbcnt_hi_u32_b32 v4, -1, v152 v_and_b32_e32 v5, 63, v4 v_and_b32_e32 v8, 64, v4 v_xor_b32_e32 v7, 32, v5 v_add_u32_e32 v8, 64, v8 v_cmp_lt_i32_e32 vcc, v7, v8 v_cndmask_b32_e32 v7, v4, v7, vcc v_lshlrev_b32_e32 v7, 2, v7 ds_bpermute_b32 v7, v7, v154 v_xor_b32_e32 v5, 16, v5 v_cmp_lt_i32_e32 vcc, v5, v8 v_cndmask_b32_e32 v4, v4, v5, vcc v_lshlrev_b32_e32 v5, 2, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v7, v154 ds_bpermute_b32 v5, v5, v4 s_waitcnt vmcnt(0) v_readfirstlane_b32 s11, v6 s_and_saveexec_b64 s[20:21], s[2:3] s_cbranch_execz .LBB1_44 ; %bb.43: ; in Loop: Header=BB1_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_write_b32 v118, v4 .LBB1_44: ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[20:21] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[20:21], s[4:5] s_cbranch_execz .LBB1_46 ; %bb.45: ; in Loop: Header=BB1_5 Depth=1 ds_read_b64 v[4:5], v119 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v5, v4 ds_write_b32 v148, v4 offset:512 .LBB1_46: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp1ILb0ELb1ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi256EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[20:21] s_and_b64 s[8:9], s[8:9], exec s_cselect_b32 s8, s33, 0 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v4, v148 
offset:512 s_add_i32 s8, s11, s8 s_mul_i32 s8, s8, s29 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s20, s8, s6 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v4 s_and_saveexec_b64 s[26:27], s[30:31] s_cbranch_execz .LBB1_48 ; %bb.47: ; in Loop: Header=BB1_5 Depth=1 v_log_f32_e32 v5, v4 s_ashr_i32 s21, s20, 31 v_mov_b32_e32 v6, 0xff800000 s_lshl_b64 s[8:9], s[20:21], 2 v_mul_f32_e32 v5, 0x3f317218, v5 v_fmac_f32_e32 v5, v155, v111 v_cndmask_b32_e32 v7, v5, v6, vcc v_mov_b32_e32 v6, s9 v_add_co_u32_e64 v5, s[8:9], s8, v127 v_addc_co_u32_e64 v6, s[8:9], v128, v6, s[8:9] global_store_dword v[5:6], v7, off .LBB1_48: ; %.loopexit677.i ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[26:27] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB1_50 ; %bb.49: ; %.preheader674.i ; in Loop: Header=BB1_5 Depth=1 v_rcp_f32_e32 v7, v4 v_mov_b32_e32 v6, v53 v_mov_b32_e32 v22, v69 v_mov_b32_e32 v23, v65 v_mul_f32_e32 v7, s34, v7 v_cndmask_b32_e64 v174, v7, 1.0, vcc v_mov_b32_e32 v175, v174 v_mov_b32_e32 v7, v49 v_mov_b32_e32 v38, v85 v_mov_b32_e32 v39, v81 v_mov_b32_e32 v162, v97 v_mov_b32_e32 v163, v93 v_pk_mul_f32 v[8:9], v[174:175], v[6:7] v_mov_b32_e32 v6, v54 v_mov_b32_e32 v7, v50 v_pk_mul_f32 v[24:25], v[174:175], v[22:23] v_mov_b32_e32 v22, v70 v_mov_b32_e32 v23, v66 v_pk_mul_f32 v[100:101], v[174:175], v[38:39] v_mov_b32_e32 v38, v86 v_mov_b32_e32 v39, v82 v_pk_mul_f32 v[164:165], v[174:175], v[162:163] v_mov_b32_e32 v162, v98 v_mov_b32_e32 v163, v94 v_pk_mul_f32 v[12:13], v[174:175], v[6:7] v_mov_b32_e32 v6, v55 v_mov_b32_e32 v7, v51 v_pk_mul_f32 v[28:29], v[174:175], v[22:23] v_mov_b32_e32 v22, v71 v_mov_b32_e32 v23, v67 v_pk_mul_f32 v[104:105], v[174:175], v[38:39] v_mov_b32_e32 v38, v87 v_mov_b32_e32 v39, v83 v_pk_mul_f32 v[168:169], v[174:175], v[162:163] v_mov_b32_e32 v162, v99 v_mov_b32_e32 v163, v95 v_mov_b32_e32 v4, v52 v_mov_b32_e32 v5, v48 v_pk_mul_f32 v[16:17], v[174:175], v[6:7] v_mov_b32_e32 v6, v44 v_mov_b32_e32 v7, v40 v_mov_b32_e32 v10, v45 
v_mov_b32_e32 v11, v41 v_mov_b32_e32 v14, v46 v_mov_b32_e32 v15, v42 v_mov_b32_e32 v18, v47 v_mov_b32_e32 v19, v43 v_mov_b32_e32 v20, v68 v_mov_b32_e32 v21, v64 v_pk_mul_f32 v[32:33], v[174:175], v[22:23] v_mov_b32_e32 v22, v60 v_mov_b32_e32 v23, v56 v_mov_b32_e32 v26, v61 v_mov_b32_e32 v27, v57 v_mov_b32_e32 v30, v62 v_mov_b32_e32 v31, v58 v_mov_b32_e32 v34, v63 v_mov_b32_e32 v35, v59 v_mov_b32_e32 v36, v84 v_mov_b32_e32 v37, v80 v_pk_mul_f32 v[156:157], v[174:175], v[38:39] v_mov_b32_e32 v38, v76 v_mov_b32_e32 v39, v72 v_mov_b32_e32 v102, v77 v_mov_b32_e32 v103, v73 v_mov_b32_e32 v106, v78 v_mov_b32_e32 v107, v74 v_mov_b32_e32 v158, v79 v_mov_b32_e32 v159, v75 v_mov_b32_e32 v160, v96 v_mov_b32_e32 v161, v92 v_pk_mul_f32 v[172:173], v[174:175], v[162:163] v_mov_b32_e32 v162, v88 v_mov_b32_e32 v163, v0 v_mov_b32_e32 v166, v89 v_mov_b32_e32 v167, v1 v_mov_b32_e32 v170, v90 v_mov_b32_e32 v171, v2 v_mov_b32_e32 v176, v91 v_mov_b32_e32 v177, v3 s_mul_i32 s20, s20, s28 v_or_b32_e32 v178, s78, v120 v_pk_mul_f32 v[4:5], v[174:175], v[4:5] v_pk_mul_f32 v[6:7], v[174:175], v[6:7] v_pk_mul_f32 v[10:11], v[174:175], v[10:11] v_pk_mul_f32 v[14:15], v[174:175], v[14:15] v_pk_mul_f32 v[18:19], v[174:175], v[18:19] v_pk_mul_f32 v[20:21], v[174:175], v[20:21] v_pk_mul_f32 v[22:23], v[174:175], v[22:23] v_pk_mul_f32 v[26:27], v[174:175], v[26:27] v_pk_mul_f32 v[30:31], v[174:175], v[30:31] v_pk_mul_f32 v[34:35], v[174:175], v[34:35] v_pk_mul_f32 v[36:37], v[174:175], v[36:37] v_pk_mul_f32 v[38:39], v[174:175], v[38:39] v_pk_mul_f32 v[102:103], v[174:175], v[102:103] v_pk_mul_f32 v[106:107], v[174:175], v[106:107] v_pk_mul_f32 v[158:159], v[174:175], v[158:159] v_pk_mul_f32 v[160:161], v[174:175], v[160:161] v_pk_mul_f32 v[162:163], v[174:175], v[162:163] v_pk_mul_f32 v[166:167], v[174:175], v[166:167] v_pk_mul_f32 v[170:171], v[174:175], v[170:171] v_pk_mul_f32 v[174:175], v[174:175], v[176:177] s_ashr_i32 s21, s20, 31 v_ashrrev_i32_e32 v177, 31, v178 v_add_co_u32_e32 v176, vcc, 
v178, v149 s_lshl_b64 s[20:21], s[20:21], 2 v_addc_co_u32_e32 v177, vcc, 0, v177, vcc s_add_u32 s11, s64, s20 v_lshlrev_b64 v[176:177], 2, v[176:177] s_addc_u32 s20, s65, s21 v_mov_b32_e32 v179, s20 v_add_co_u32_e32 v176, vcc, s11, v176 v_addc_co_u32_e32 v177, vcc, v179, v177, vcc global_store_dwordx4 v[176:177], v[4:7], off global_store_dwordx4 v[176:177], v[8:11], off offset:16 global_store_dwordx4 v[176:177], v[12:15], off offset:32 global_store_dwordx4 v[176:177], v[16:19], off offset:48 v_add_u32_e32 v4, 0x80, v178 v_ashrrev_i32_e32 v5, 31, v4 v_add_co_u32_e32 v4, vcc, v4, v149 v_addc_co_u32_e32 v5, vcc, 0, v5, vcc v_lshlrev_b64 v[4:5], 2, v[4:5] v_mov_b32_e32 v6, s20 v_add_co_u32_e32 v4, vcc, s11, v4 v_addc_co_u32_e32 v5, vcc, v6, v5, vcc global_store_dwordx4 v[4:5], v[20:23], off global_store_dwordx4 v[4:5], v[24:27], off offset:16 global_store_dwordx4 v[4:5], v[28:31], off offset:32 global_store_dwordx4 v[4:5], v[32:35], off offset:48 v_add_u32_e32 v4, 0x100, v178 v_ashrrev_i32_e32 v5, 31, v4 v_add_co_u32_e32 v4, vcc, v4, v149 v_addc_co_u32_e32 v5, vcc, 0, v5, vcc v_lshlrev_b64 v[4:5], 2, v[4:5] v_add_co_u32_e32 v4, vcc, s11, v4 v_addc_co_u32_e32 v5, vcc, v6, v5, vcc global_store_dwordx4 v[4:5], v[36:39], off global_store_dwordx4 v[4:5], v[100:103], off offset:16 global_store_dwordx4 v[4:5], v[104:107], off offset:32 global_store_dwordx4 v[4:5], v[156:159], off offset:48 v_add_u32_e32 v4, 0x180, v178 v_ashrrev_i32_e32 v5, 31, v4 v_add_co_u32_e32 v4, vcc, v4, v149 v_addc_co_u32_e32 v5, vcc, 0, v5, vcc v_lshlrev_b64 v[4:5], 2, v[4:5] v_add_co_u32_e32 v4, vcc, s11, v4 v_addc_co_u32_e32 v5, vcc, v6, v5, vcc global_store_dwordx4 v[4:5], v[160:163], off global_store_dwordx4 v[4:5], v[164:167], off offset:16 global_store_dwordx4 v[4:5], v[168:171], off offset:32 global_store_dwordx4 v[4:5], v[172:175], off offset:48 .LBB1_50: ; %Flow ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_mov_b64 s[26:27], 0 s_branch .LBB1_4 .LBB1_51: ; in Loop: Header=BB1_5 
Depth=1 v_mbcnt_hi_u32_b32 v4, -1, v152 v_and_b32_e32 v5, 63, v4 v_and_b32_e32 v7, 64, v4 v_xor_b32_e32 v6, 32, v5 v_add_u32_e32 v7, 64, v7 v_cmp_lt_i32_e32 vcc, v6, v7 v_cndmask_b32_e32 v6, v4, v6, vcc v_lshlrev_b32_e32 v6, 2, v6 ds_bpermute_b32 v6, v6, v154 v_xor_b32_e32 v5, 16, v5 v_cmp_lt_i32_e32 vcc, v5, v7 v_cndmask_b32_e32 v4, v4, v5, vcc v_lshlrev_b32_e32 v5, 2, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v6, v154 ds_bpermute_b32 v5, v5, v4 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB1_53 ; %bb.52: ; in Loop: Header=BB1_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_write_b32 v118, v4 .LBB1_53: ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB1_55 ; %bb.54: ; in Loop: Header=BB1_5 Depth=1 ds_read_b64 v[4:5], v119 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v5, v4 ds_write_b32 v148, v4 offset:512 .LBB1_55: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp1ILb0ELb0ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi256EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v4, v148 offset:512 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v4 s_and_saveexec_b64 s[20:21], s[30:31] s_cbranch_execz .LBB1_57 ; %bb.56: ; in Loop: Header=BB1_5 Depth=1 s_mul_i32 s8, s18, s29 v_log_f32_e32 v5, v4 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s8, s8, s6 s_ashr_i32 s9, s8, 31 v_mul_f32_e32 v5, 0x3f317218, v5 v_fmac_f32_e32 v5, v155, v111 v_mov_b32_e32 v6, 0x7f800000 s_lshl_b64 s[8:9], s[8:9], 2 v_cndmask_b32_e32 v7, v5, v6, vcc v_mov_b32_e32 v6, s9 v_add_co_u32_e64 v5, s[8:9], s8, v129 v_addc_co_u32_e64 v6, s[8:9], v130, v6, s[8:9] global_store_dword v[5:6], v7, off .LBB1_57: ; %.loopexit.i ; in 
Loop: Header=BB1_5 Depth=1 s_or_b64 exec, exec, s[20:21] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB1_3 ; %bb.58: ; %.preheader.i ; in Loop: Header=BB1_5 Depth=1 s_mul_i32 s11, s18, s69 s_mul_hi_u32 s20, s18, s68 v_rcp_f32_e32 v4, v4 s_add_i32 s11, s20, s11 s_mul_i32 s19, s19, s68 s_add_i32 s11, s11, s19 s_mul_i32 s19, s18, s68 s_add_u32 s20, s73, s19 s_addc_u32 s21, s75, s11 v_mul_f32_e32 v4, s34, v4 s_lshl_b64 s[20:21], s[20:21], 1 v_cndmask_b32_e64 v4, v4, 1.0, vcc s_add_u32 s11, s16, s20 v_mul_f32_e32 v13, v4, v44 v_mul_f32_e32 v16, v4, v40 s_addc_u32 s19, s17, s21 v_mul_f32_e32 v5, v4, v52 v_mul_f32_e32 v6, v4, v53 v_mul_f32_e32 v9, v4, v48 v_mul_f32_e32 v10, v4, v49 v_mul_f32_e32 v14, v4, v45 v_mul_f32_e32 v19, v4, v41 v_mul_f32_e32 v22, v4, v68 v_mul_f32_e32 v29, v4, v67 v_mul_f32_e32 v67, v4, v1 v_mul_f32_e32 v68, v4, v2 v_cvt_pk_bf16_f32 v1, v13, v16 v_mov_b32_e32 v2, s19 v_add_co_u32_e32 v16, vcc, s11, v109 v_mul_f32_e32 v7, v4, v54 v_mul_f32_e32 v8, v4, v55 v_mul_f32_e32 v11, v4, v50 v_mul_f32_e32 v12, v4, v51 v_mul_f32_e32 v15, v4, v46 v_mul_f32_e32 v18, v4, v47 v_mul_f32_e32 v20, v4, v42 v_mul_f32_e32 v21, v4, v43 v_mul_f32_e32 v23, v4, v69 v_mul_f32_e32 v28, v4, v66 v_mul_f32_e32 v66, v4, v0 v_mul_f32_e32 v69, v4, v3 v_cvt_pk_bf16_f32 v0, v5, v9 v_addc_co_u32_e32 v17, vcc, v2, v110, vcc v_cvt_pk_bf16_f32 v2, v6, v10 v_cvt_pk_bf16_f32 v3, v14, v19 v_mul_f32_e32 v24, v4, v70 v_mul_f32_e32 v25, v4, v71 v_mul_f32_e32 v26, v4, v64 v_mul_f32_e32 v27, v4, v65 v_mul_f32_e32 v30, v4, v60 v_mul_f32_e32 v31, v4, v61 v_mul_f32_e32 v32, v4, v62 v_mul_f32_e32 v33, v4, v63 v_mul_f32_e32 v34, v4, v56 v_mul_f32_e32 v35, v4, v57 v_mul_f32_e32 v36, v4, v58 v_mul_f32_e32 v37, v4, v59 v_mul_f32_e32 v38, v4, v84 v_mul_f32_e32 v39, v4, v85 v_mul_f32_e32 v40, v4, v86 v_mul_f32_e32 v41, v4, v87 v_mul_f32_e32 v42, v4, v80 v_mul_f32_e32 v43, v4, v81 v_mul_f32_e32 v44, v4, v82 v_mul_f32_e32 v45, v4, v83 v_mul_f32_e32 v46, v4, v76 v_mul_f32_e32 v47, v4, v77 
v_mul_f32_e32 v48, v4, v78 v_mul_f32_e32 v49, v4, v79 v_mul_f32_e32 v50, v4, v72 v_mul_f32_e32 v51, v4, v73 v_mul_f32_e32 v52, v4, v74 v_mul_f32_e32 v53, v4, v75 v_mul_f32_e32 v54, v4, v96 v_mul_f32_e32 v55, v4, v97 v_mul_f32_e32 v56, v4, v98 v_mul_f32_e32 v57, v4, v99 v_mul_f32_e32 v58, v4, v92 v_mul_f32_e32 v59, v4, v93 v_mul_f32_e32 v60, v4, v94 v_mul_f32_e32 v61, v4, v95 v_mul_f32_e32 v62, v4, v88 v_mul_f32_e32 v63, v4, v89 v_mul_f32_e32 v64, v4, v90 v_mul_f32_e32 v65, v4, v91 v_cvt_pk_bf16_f32 v4, v7, v11 v_cvt_pk_bf16_f32 v5, v15, v20 v_cvt_pk_bf16_f32 v6, v8, v12 v_cvt_pk_bf16_f32 v7, v18, v21 v_cvt_pk_bf16_f32 v8, v22, v26 v_cvt_pk_bf16_f32 v9, v30, v34 v_cvt_pk_bf16_f32 v10, v23, v27 v_cvt_pk_bf16_f32 v11, v31, v35 v_cvt_pk_bf16_f32 v12, v24, v28 v_cvt_pk_bf16_f32 v13, v32, v36 v_cvt_pk_bf16_f32 v14, v25, v29 v_cvt_pk_bf16_f32 v15, v33, v37 global_store_dwordx4 v[16:17], v[0:3], off global_store_dwordx4 v[16:17], v[4:7], off offset:16 global_store_dwordx4 v[16:17], v[8:11], off offset:256 global_store_dwordx4 v[16:17], v[12:15], off offset:272 v_cvt_pk_bf16_f32 v0, v38, v42 v_cvt_pk_bf16_f32 v1, v46, v50 v_cvt_pk_bf16_f32 v2, v39, v43 v_cvt_pk_bf16_f32 v3, v47, v51 v_cvt_pk_bf16_f32 v4, v40, v44 v_cvt_pk_bf16_f32 v5, v48, v52 v_cvt_pk_bf16_f32 v6, v41, v45 v_cvt_pk_bf16_f32 v7, v49, v53 v_cvt_pk_bf16_f32 v8, v54, v58 v_cvt_pk_bf16_f32 v9, v62, v66 v_cvt_pk_bf16_f32 v10, v55, v59 v_cvt_pk_bf16_f32 v11, v63, v67 v_cvt_pk_bf16_f32 v12, v56, v60 v_cvt_pk_bf16_f32 v13, v64, v68 v_cvt_pk_bf16_f32 v14, v57, v61 v_cvt_pk_bf16_f32 v15, v65, v69 global_store_dwordx4 v[16:17], v[0:3], off offset:512 global_store_dwordx4 v[16:17], v[4:7], off offset:528 global_store_dwordx4 v[16:17], v[8:11], off offset:768 global_store_dwordx4 v[16:17], v[12:15], off offset:784 s_branch .LBB1_3 .LBB1_59: ; %.loopexit s_endpgm .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 0 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_sgpr_workgroup_id_z 1 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 223 .amdhsa_next_free_sgpr 90 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end1: .size 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params, .Lfunc_end1-_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 10572 ; NumSgprs: 94 ; NumVgprs: 223 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 0 bytes/workgroup (compile time only) ; SGPRBlocks: 11 ; VGPRBlocks: 55 ; NumSGPRsForWavesPerEU: 94 ; NumVGPRsForWavesPerEU: 223 ; Occupancy: 1 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params .globl _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params,@function 
; -----------------------------------------------------------------------------
; flash::flash_fwd_splitkv_mla_combine_kernel<cutlass::bfloat16_t, float, long,
;                                             512, 32, 128>(Flash_fwd_mla_params)
;
; Compiler-generated kernel body followed by its .amdhsa_kernel descriptor.
; NOTE(review): this listing has lost its original line breaks, so labels,
; instructions and ';' comments share physical lines. The notes below therefore
; refer to labels (%bb.N / .LBB2_N), not to line positions.
;
; Registers on entry (descriptor: 6 user SGPRs, workgroup_id_x enabled,
; workitem id dimension 0 only):
;   s[4:5]  base address of every kernel-argument s_load in this kernel
;   s6      presumably workgroup id X (first SGPR after the 6 user SGPRs)
;           - confirm against the AMDHSA initial-register-state rules
;   v0      presumably workitem id X - same caveat
;
; Kernel-argument offsets read (field names are not visible in this file):
;   0x04, 0x10  two dwords; s26 = arg[0x04], s27 = arg[0x10] * arg[0x04]
;   0x120       base of a dword table; entries [q] and [q+1] go to s20, s21
;   0x128       base of the floats reduced in .LBB2_4
;   0x60        base of the float written by .LBB2_19
;   0x130       base of the float rows accumulated in .LBB2_11 / .LBB2_15
;   0x58        output base; 0x90, 0xc0, 0xf0 are 64-bit multipliers whose
;               products are shifted left by 1 (presumably bf16 element strides)
;
; Flow:
;   %bb.0     q = s6 / s27 as a signed division (float-reciprocal estimate
;             plus two correction steps) -> s8, sign-extended into s9.
;             s28 = table[q+1] - table[q]. If s28 == 1 the kernel branches to
;             .LBB2_17 (s_endpgm) without writing anything.
;   %bb.1     r = s6 % s27 (sign follows s6); s16 = r + s20 * s27.
;             Only work-items with v0 < 64 enter the reduction below.
;   %bb.2/3   v1 defaults to 0xff800000 (-inf); work-items with v0 < s28 load
;             the float at arg[0x128] + 4 * (s16 + s27 * v0).
;   .LBB2_4   Cross-lane xor exchange via ds_bpermute_b32 with lane masks
;             32,16,8,4,2,1: first max(v1) -> v2 (a -inf maximum is replaced
;             by 0), then sum of exp(v1 - v2) -> v4. exp is v_exp_f32 applied
;             to x * 0x3fb8aa3b (~1.442695); for x < 0xc2aeac50 (~-87.34) the
;             input is biased by 64.0 and the result rescaled by 0x114b4ea4
;             (~1.6e-28).
;   .LBB2_18  Taken when the sum compares not-equal to 0:
;             v3 = 0x3f317218 (~0.693147) * v_log_f32(sum) + v2, with a 2^32
;             pre-scale and a 0xc1b17218 (~-22.18) correction for inputs that
;             match class mask 0x90. Otherwise v3 stays 0x7f800000 (+inf).
;   .LBB2_19  Work-item 0 stores v3 to arg[0x60] + 4 * s6. This block
;             overwrites s[4:5]; no kernel-argument load follows it.
;   .LBB2_7   Work-items with v0 < s28 write exp(v1 - v3) to LDS byte 4 * v0.
;             NOTE(review): the descriptor reserves 128 bytes of LDS (32
;             floats) while this path admits v0 up to 63 - presumably callers
;             guarantee s28 <= 32; confirm.
;   .LBB2_8   Barrier, accumulators v[1:4] = 0; s28 < 1 skips to .LBB2_16.
;   .LBB2_11  Main loop, 8 rows per trip for (s28 & -8) rows. Each work-item
;             loads one dwordx4 per row at byte offset 16 * v0, row stride
;             4 * (s27 << 9) bytes, and multiplies it by the per-row weight
;             read back from LDS with ds_read_b128.
;   .LBB2_15  Tail loop for the remaining (s28 & 7) rows, one ds_read_b32
;             weight per row.
;   .LBB2_16  v_cvt_pk_bf16_f32 packs the four accumulators into two dwords
;             (the and/or sequences that follow recombine the halves
;             unchanged). r is recomputed and split into r / s26 and r % s26;
;             the 8-byte result is stored at
;             arg[0x58] + 2 * (arg[0x90]*q + arg[0xf0]*(r/s26) + arg[0xc0]*(r%s26))
;             plus byte offset 8 * v0.
;   .LBB2_17  s_endpgm.
; -----------------------------------------------------------------------------
_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params ; %bb.0: s_load_dword s26, s[4:5], 0x4 s_load_dword s27, s[4:5], 0x10 s_load_dwordx2 s[2:3], s[4:5], 0x120 s_ashr_i32 s7, s6, 31 s_waitcnt lgkmcnt(0) s_mul_i32 s27, s27, s26 s_ashr_i32 s0, s27, 31 s_add_i32 s1, s27, s0 s_xor_b32 s16, s1, s0 v_cvt_f32_u32_e32 v1, s16 s_xor_b32 s10, s7, s0 s_sub_i32 s0, 0, s16 s_add_i32 s1, s6, s7 v_rcp_iflag_f32_e32 v1, v1 s_xor_b32 s1, s1, s7 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 v_readfirstlane_b32 s8, v1 s_mul_i32 s0, s0, s8 s_mul_hi_u32 s0, s8, s0 s_add_i32 s8, s8, s0 s_mul_hi_u32 s0, s1, s8 s_mul_i32 s8, s0, s16 s_sub_i32 s1, s1, s8 s_add_i32 s9, s0, 1 s_sub_i32 s8, s1, s16 s_cmp_ge_u32 s1, s16 s_cselect_b32 s11, s9, s0 s_cselect_b32 s17, s8, s1 s_add_i32 s12, s11, 1 s_cmp_ge_u32 s17, s16 s_cselect_b64 s[0:1], -1, 0 s_and_b64 s[8:9], s[0:1], exec s_cselect_b32 s8, s12, s11 s_xor_b32 s8, s8, s10 s_sub_i32 s8, s8, s10 s_ashr_i32 s9, s8, 31 s_lshl_b64 s[10:11], s[8:9], 2 s_add_u32 s2, s2, s10 s_addc_u32 s3, s3, s11 s_load_dwordx2 s[20:21], s[2:3], 0x0 s_waitcnt lgkmcnt(0) s_sub_i32 s28, s21, s20 s_cmp_eq_u32 s28, 1 s_cbranch_scc1 .LBB2_17 ; %bb.1: s_load_dwordx2 s[12:13], s[4:5], 0x58 s_load_dwordx2 s[14:15], s[4:5], 0x90 s_load_dwordx2 s[2:3], s[4:5], 0xc0 s_load_dwordx2 s[10:11], s[4:5], 0xf0 s_load_dwordx2 s[18:19], s[4:5], 0x130 s_sub_i32 s16, s17, s16 s_and_b64 s[0:1], s[0:1], exec s_cselect_b32 s0, s16, s17 s_xor_b32 s0, s0, s7 s_sub_i32 s0, s0, s7 s_mul_i32 s1, s20, s27 s_add_i32 s16, s0, s1 v_cmp_gt_u32_e32 vcc, 64, v0 s_and_saveexec_b64 s[22:23], vcc s_cbranch_execz .LBB2_8 ; %bb.2: v_cmp_gt_i32_e32 vcc, s28, v0 v_mov_b32_e32 v1, 0xff800000 s_and_saveexec_b64 s[24:25], vcc s_cbranch_execz .LBB2_4 ; %bb.3: s_load_dwordx2 s[0:1], s[4:5], 0x128 v_mul_lo_u32 v1, s27, v0
s_ashr_i32 s17, s16, 31 s_lshl_b64 s[30:31], s[16:17], 2 v_ashrrev_i32_e32 v2, 31, v1 s_waitcnt lgkmcnt(0) s_add_u32 s0, s0, s30 v_lshlrev_b64 v[1:2], 2, v[1:2] s_addc_u32 s1, s1, s31 v_mov_b32_e32 v3, s1 v_add_co_u32_e64 v1, s[0:1], s0, v1 v_addc_co_u32_e64 v2, s[0:1], v3, v2, s[0:1] global_load_dword v1, v[1:2], off .LBB2_4: ; %.critedge s_or_b64 exec, exec, s[24:25] v_mbcnt_lo_u32_b32 v3, -1, 0 v_mbcnt_hi_u32_b32 v3, -1, v3 v_and_b32_e32 v4, 63, v3 v_and_b32_e32 v5, 64, v3 v_add_u32_e32 v5, 64, v5 v_xor_b32_e32 v6, 32, v4 v_cmp_lt_i32_e64 s[0:1], v6, v5 v_cndmask_b32_e64 v6, v3, v6, s[0:1] s_waitcnt vmcnt(0) v_max_f32_e32 v2, 0xff800000, v1 v_lshlrev_b32_e32 v6, 2, v6 ds_bpermute_b32 v7, v6, v2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v7 v_xor_b32_e32 v7, 16, v4 v_cmp_lt_i32_e64 s[0:1], v7, v5 v_cndmask_b32_e64 v7, v3, v7, s[0:1] v_lshlrev_b32_e32 v7, 2, v7 ds_bpermute_b32 v8, v7, v2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v8 v_xor_b32_e32 v8, 8, v4 v_cmp_lt_i32_e64 s[0:1], v8, v5 v_cndmask_b32_e64 v8, v3, v8, s[0:1] v_lshlrev_b32_e32 v8, 2, v8 ds_bpermute_b32 v9, v8, v2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v9 v_xor_b32_e32 v9, 4, v4 v_cmp_lt_i32_e64 s[0:1], v9, v5 v_cndmask_b32_e64 v9, v3, v9, s[0:1] v_lshlrev_b32_e32 v9, 2, v9 ds_bpermute_b32 v10, v9, v2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v10 v_xor_b32_e32 v10, 2, v4 v_cmp_lt_i32_e64 s[0:1], v10, v5 v_cndmask_b32_e64 v10, v3, v10, s[0:1] v_lshlrev_b32_e32 v10, 2, v10 ds_bpermute_b32 v11, v10, v2 v_xor_b32_e32 v4, 1, v4 v_cmp_lt_i32_e64 s[0:1], v4, v5 v_cndmask_b32_e64 v3, v3, v4, s[0:1] v_lshlrev_b32_e32 v3, 2, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v11 ds_bpermute_b32 v4, v3, v2 s_mov_b32 s0, 0xff800000 v_mov_b32_e32 v11, 0x42800000 v_mov_b32_e32 v5, 0x114b4ea4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v4 v_cmp_lg_f32_e64 s[0:1], s0, v2 v_cndmask_b32_e64 v2, 0, v2, s[0:1] v_sub_f32_e32 v4, v1, v2 s_mov_b32 s0, 0xc2aeac50 v_cmp_gt_f32_e64 s[0:1], s0, v4 v_cndmask_b32_e64 v11, 0, v11, 
s[0:1] v_add_f32_e32 v4, v4, v11 v_mul_f32_e32 v4, 0x3fb8aa3b, v4 v_cndmask_b32_e64 v5, 1.0, v5, s[0:1] v_exp_f32_e32 v4, v4 v_mul_f32_e32 v11, v5, v4 ds_bpermute_b32 v6, v6, v11 s_waitcnt lgkmcnt(0) v_fmac_f32_e32 v6, v5, v4 ds_bpermute_b32 v4, v7, v6 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v6, v4 ds_bpermute_b32 v5, v8, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v9, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v10, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v3, v3, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v3 v_cmp_lg_f32_e64 s[0:1], 0, v4 v_mov_b32_e32 v3, 0x7f800000 s_and_saveexec_b64 s[24:25], s[0:1] s_cbranch_execnz .LBB2_18 ; %bb.5: s_or_b64 exec, exec, s[24:25] v_cmp_eq_u32_e64 s[0:1], 0, v0 s_and_saveexec_b64 s[24:25], s[0:1] s_cbranch_execnz .LBB2_19 .LBB2_6: s_or_b64 exec, exec, s[24:25] s_and_b64 exec, exec, vcc s_cbranch_execz .LBB2_8 .LBB2_7: v_sub_f32_e32 v1, v1, v3 s_mov_b32 s0, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s0, v1 v_mov_b32_e32 v3, 0x42800000 v_cndmask_b32_e32 v3, 0, v3, vcc v_add_f32_e32 v1, v1, v3 v_mul_f32_e32 v1, 0x3fb8aa3b, v1 v_mov_b32_e32 v2, 0x114b4ea4 v_exp_f32_e32 v1, v1 v_cndmask_b32_e32 v2, 1.0, v2, vcc v_mul_f32_e32 v1, v2, v1 v_lshlrev_b32_e32 v2, 2, v0 ds_write_b32 v2, v1 .LBB2_8: ; %Flow240 s_or_b64 exec, exec, s[22:23] v_lshlrev_b32_e32 v7, 2, v0 s_mov_b32 s0, 0 s_cmp_lt_i32 s28, 1 v_mov_b32_e32 v2, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v3, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB2_16 ; %bb.9: ; %.lr.ph s_lshl_b32 s4, s16, 9 s_ashr_i32 s5, s4, 31 s_lshl_b64 s[16:17], s[4:5], 2 s_add_u32 s7, s18, s16 s_addc_u32 s22, s19, s17 s_lshl_b32 s4, s27, 9 s_not_b32 s1, s20 s_ashr_i32 s5, s4, 31 s_add_i32 s20, s21, s1 s_cmp_lt_u32 s20, 7 s_cbranch_scc1 .LBB2_13 ; %bb.10: ; %.lr.ph.new v_lshlrev_b32_e32 v0, 4, v0 v_mov_b32_e32 v1, s19 v_add_co_u32_e32 v5, vcc, s18, v0 s_and_b32 s0, s28, -8 v_addc_co_u32_e32 
v0, vcc, 0, v1, vcc s_lshl_b64 s[18:19], s[4:5], 5 s_mov_b32 s1, 0 s_mov_b32 s7, 0 v_mov_b32_e32 v3, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v2, 0 s_lshl_b64 s[20:21], s[4:5], 2 .LBB2_11: ; =>This Inner Loop Header: Depth=1 v_mov_b32_e32 v6, s17 v_add_co_u32_e32 v11, vcc, s16, v5 v_addc_co_u32_e32 v12, vcc, v0, v6, vcc v_mov_b32_e32 v38, s21 v_add_co_u32_e32 v20, vcc, s20, v11 v_addc_co_u32_e32 v21, vcc, v12, v38, vcc v_add_co_u32_e32 v22, vcc, s20, v20 v_addc_co_u32_e32 v23, vcc, v21, v38, vcc v_add_co_u32_e32 v28, vcc, s20, v22 v_addc_co_u32_e32 v29, vcc, v23, v38, vcc v_add_co_u32_e32 v30, vcc, s20, v28 global_load_dwordx4 v[8:11], v[11:12], off v_addc_co_u32_e32 v31, vcc, v29, v38, vcc v_add_co_u32_e32 v32, vcc, s20, v30 global_load_dwordx4 v[12:15], v[20:21], off global_load_dwordx4 v[16:19], v[22:23], off v_addc_co_u32_e32 v33, vcc, v31, v38, vcc v_add_co_u32_e32 v35, vcc, s20, v32 global_load_dwordx4 v[20:23], v[28:29], off global_load_dwordx4 v[24:27], v[30:31], off v_addc_co_u32_e32 v36, vcc, v33, v38, vcc global_load_dwordx4 v[28:31], v[32:33], off v_add_co_u32_e32 v37, vcc, s20, v35 global_load_dwordx4 v[32:35], v[35:36], off v_addc_co_u32_e32 v38, vcc, v36, v38, vcc global_load_dwordx4 v[36:39], v[37:38], off v_mov_b32_e32 v6, s1 ds_read_b128 v[40:43], v6 ds_read_b128 v[44:47], v6 offset:16 v_mov_b32_e32 v48, s19 v_add_co_u32_e32 v5, vcc, s18, v5 v_addc_co_u32_e32 v0, vcc, v0, v48, vcc s_waitcnt lgkmcnt(1) v_mov_b32_e32 v48, v40 v_mov_b32_e32 v49, v40 v_mov_b32_e32 v40, v41 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v50, v44 v_mov_b32_e32 v51, v44 v_mov_b32_e32 v44, v45 s_add_i32 s7, s7, 8 s_add_i32 s1, s1, 32 s_cmp_eq_u32 s0, s7 s_waitcnt vmcnt(7) v_pk_fma_f32 v[1:2], v[48:49], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[48:49], v[3:4] s_waitcnt vmcnt(6) v_pk_fma_f32 v[1:2], v[40:41], v[12:13], v[1:2] v_pk_fma_f32 v[3:4], v[14:15], v[40:41], v[3:4] v_mov_b32_e32 v41, v42 s_waitcnt vmcnt(5) v_pk_fma_f32 v[1:2], v[41:42], v[16:17], v[1:2] 
v_pk_fma_f32 v[3:4], v[18:19], v[41:42], v[3:4] v_mov_b32_e32 v42, v43 s_waitcnt vmcnt(4) v_pk_fma_f32 v[1:2], v[42:43], v[20:21], v[1:2] v_pk_fma_f32 v[3:4], v[22:23], v[42:43], v[3:4] s_waitcnt vmcnt(3) v_pk_fma_f32 v[1:2], v[50:51], v[24:25], v[1:2] v_pk_fma_f32 v[3:4], v[26:27], v[50:51], v[3:4] s_waitcnt vmcnt(2) v_pk_fma_f32 v[1:2], v[44:45], v[28:29], v[1:2] v_pk_fma_f32 v[3:4], v[30:31], v[44:45], v[3:4] v_mov_b32_e32 v45, v46 s_waitcnt vmcnt(1) v_pk_fma_f32 v[1:2], v[45:46], v[32:33], v[1:2] v_pk_fma_f32 v[3:4], v[34:35], v[45:46], v[3:4] v_mov_b32_e32 v46, v47 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[46:47], v[36:37], v[1:2] v_pk_fma_f32 v[3:4], v[38:39], v[46:47], v[3:4] s_cbranch_scc0 .LBB2_11 ; %bb.12: ; %._crit_edge.loopexit.unr-lcssa.loopexit v_mov_b32_e32 v6, s17 v_add_co_u32_e32 v5, vcc, s16, v5 v_addc_co_u32_e32 v0, vcc, v0, v6, vcc s_and_b32 s7, s28, 7 s_cmp_eq_u32 s7, 0 s_cbranch_scc0 .LBB2_14 s_branch .LBB2_16 .LBB2_13: v_lshlrev_b32_e32 v0, 2, v7 s_mov_b32 s1, s0 v_mov_b32_e32 v3, s22 v_add_co_u32_e32 v5, vcc, s7, v0 v_mov_b64_e32 v[1:2], s[0:1] v_addc_co_u32_e32 v0, vcc, 0, v3, vcc v_mov_b64_e32 v[3:4], s[0:1] s_and_b32 s7, s28, 7 s_cmp_eq_u32 s7, 0 s_cbranch_scc1 .LBB2_16 .LBB2_14: ; %.epil.preheader.preheader v_add_co_u32_e32 v5, vcc, 8, v5 s_lshl_b32 s16, s0, 2 v_addc_co_u32_e32 v6, vcc, 0, v0, vcc s_lshl_b64 s[0:1], s[4:5], 2 .LBB2_15: ; %.epil.preheader ; =>This Inner Loop Header: Depth=1 global_load_dwordx4 v[8:11], v[5:6], off offset:-8 v_mov_b32_e32 v0, s16 ds_read_b32 v12, v0 v_mov_b32_e32 v13, s1 v_add_co_u32_e32 v5, vcc, s0, v5 s_add_i32 s16, s16, 4 s_add_i32 s7, s7, -1 v_addc_co_u32_e32 v6, vcc, v6, v13, vcc s_waitcnt lgkmcnt(0) v_mov_b32_e32 v13, v12 s_cmp_lg_u32 s7, 0 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[12:13], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[12:13], v[3:4] s_cbranch_scc1 .LBB2_15 .LBB2_16: ; %._crit_edge v_cvt_pk_bf16_f32 v0, v1, v2 s_ashr_i32 s0, s26, 31 s_add_i32 s1, s26, s0 s_xor_b32 s1, s1, s0 
v_and_b32_e32 v2, 0xffff0000, v0 v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD v_cvt_f32_u32_e32 v2, s1 s_mul_i32 s4, s8, s27 s_sub_i32 s4, s6, s4 s_ashr_i32 s5, s4, 31 v_rcp_iflag_f32_e32 v2, v2 s_add_i32 s6, s4, s5 s_xor_b32 s0, s5, s0 s_xor_b32 s5, s6, s5 v_mul_f32_e32 v2, 0x4f7ffffe, v2 v_cvt_u32_f32_e32 v2, v2 s_sub_i32 s6, 0, s1 v_cvt_pk_bf16_f32 v1, v3, v4 v_or3_b32 v0, 0, v0, 0 v_readfirstlane_b32 s7, v2 s_mul_i32 s6, s6, s7 s_mul_hi_u32 s6, s7, s6 s_add_i32 s7, s7, s6 s_mul_hi_u32 s6, s5, s7 s_mul_i32 s7, s6, s1 s_sub_i32 s5, s5, s7 s_add_i32 s7, s6, 1 s_sub_i32 s16, s5, s1 s_cmp_ge_u32 s5, s1 s_cselect_b32 s6, s7, s6 s_cselect_b32 s5, s16, s5 s_add_i32 s7, s6, 1 s_cmp_ge_u32 s5, s1 s_cselect_b32 s1, s7, s6 s_xor_b32 s1, s1, s0 s_sub_i32 s5, s1, s0 s_mul_i32 s0, s5, s26 s_sub_i32 s4, s4, s0 s_mul_i32 s0, s14, s9 s_mul_hi_u32 s1, s14, s8 s_add_i32 s0, s1, s0 s_mul_i32 s1, s15, s8 s_add_i32 s1, s0, s1 s_mul_i32 s0, s14, s8 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s6, s12, s0 s_addc_u32 s7, s13, s1 s_ashr_i32 s0, s5, 31 s_mul_i32 s0, s10, s0 s_mul_hi_u32 s1, s10, s5 s_add_i32 s0, s1, s0 s_mul_i32 s1, s11, s5 s_add_i32 s1, s0, s1 s_mul_i32 s0, s10, s5 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s5, s6, s0 s_addc_u32 s6, s7, s1 s_ashr_i32 s0, s4, 31 s_mul_i32 s0, s2, s0 s_mul_hi_u32 s1, s2, s4 s_add_i32 s0, s1, s0 s_mul_i32 s1, s3, s4 s_add_i32 s1, s0, s1 s_mul_i32 s0, s2, s4 s_lshl_b64 s[0:1], s[0:1], 1 v_and_b32_e32 v3, 0xffff, v1 v_and_b32_e32 v1, 0xffff0000, v1 s_add_u32 s0, s5, s0 v_or3_b32 v1, v3, 0, v1 s_addc_u32 s1, s6, s1 v_lshlrev_b32_e32 v2, 1, v7 global_store_dwordx2 v2, v[0:1], s[0:1] .LBB2_17: s_endpgm .LBB2_18: s_movk_i32 s0, 0x90 v_mov_b32_e32 v3, 0x4f800000 v_cmp_class_f32_e64 s[0:1], v4, s0 v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] v_mul_f32_e32 v3, v4, v3 v_mov_b32_e32 v4, 0xc1b17218 v_log_f32_e32 v3, v3 v_cndmask_b32_e64 v4, 0, v4, s[0:1] v_fmac_f32_e32 v4, 0x3f317218, v3 v_add_f32_e32 v3, v4, v2 s_or_b64 
exec, exec, s[24:25] v_cmp_eq_u32_e64 s[0:1], 0, v0 s_and_saveexec_b64 s[24:25], s[0:1] s_cbranch_execz .LBB2_6 .LBB2_19: s_load_dwordx2 s[0:1], s[4:5], 0x60 s_lshl_b64 s[4:5], s[6:7], 2 v_mov_b32_e32 v2, 0 s_waitcnt lgkmcnt(0) s_add_u32 s0, s0, s4 s_addc_u32 s1, s1, s5 global_store_dword v2, v3, s[0:1] s_or_b64 exec, exec, s[24:25] s_and_b64 exec, exec, vcc s_cbranch_execnz .LBB2_7 s_branch .LBB2_8 .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 128 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 52 .amdhsa_next_free_sgpr 32 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section
.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr
; (The line above is the operand of the ".section" directive that ends the
;  preceding source line.  It and the statements down to the next ".section
;  .text" belong to the tail of the Li32 instantiation, whose body starts
;  earlier in the file; they are reproduced unchanged.)
.Lfunc_end2:
	.size	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params, .Lfunc_end2-_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params
	; -- End function
	.section	.AMDGPU.csdata
; Kernel info:
; codeLenInByte = 2296
; NumSgprs: 36
; NumVgprs: 52
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 128 bytes/workgroup (compile time only)
; SGPRBlocks: 4
; VGPRBlocks: 12
; NumSGPRsForWavesPerEU: 36
; NumVGPRsForWavesPerEU: 52
; Occupancy: 4
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0
; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1
; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1
; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3
; ===========================================================================
; Kernel: flash::flash_fwd_splitkv_mla_combine_kernel<cutlass::bfloat16_t,
;         float, long, 512, 64, 128>(Flash_fwd_mla_params)
;         (demangled from the symbol; compiler-generated code)
;
; Initial registers, as configured by this kernel's .amdhsa_* descriptor
; (6 user SGPRs = private segment buffer + kernarg pointer, followed by
; workgroup id X; one work-item id VGPR):
;   s[4:5] = kernarg segment pointer      s6 = workgroup id X
;   v0     = work-item id X
; Kernarg fields are referred to by byte offset only ("arg[0x..]"); their
; names inside Flash_fwd_mla_params are not visible in this file.
;
; Outline of what the code does:
;   1. D = arg[0x4] * arg[0x10];  q = s6 / D,  r = s6 % D  (signed).
;      Two consecutive ints {begin, end} are loaded from arg[0x120] + 4*q,
;      n = end - begin.  If n == 1 the kernel ends immediately.
;      NOTE(review): n is presumably the number of KV splits to combine -
;      confirm against the kernel source.
;   2. Work-items 0..63: lane i < n loads a float x_i from
;      arg[0x128] + 4*(r + begin*D + i*D), other lanes use -inf.  A 64-lane
;      butterfly (ds_bpermute_b32) produces max and sum(exp(x - max));
;      L = log(sum) + max, or +inf when sum == 0.  Work-item 0 stores L to
;      arg[0x60] + 4*s6; lane i < n stores exp(x_i - L) to LDS[4*i].
;   3. After s_barrier every work-item accumulates, for k = 0..n-1,
;      LDS[4*k] * (4 floats at arg[0x130] + 4*512*(r + begin*D + k*D) + 16*v0)
;      using an 8x-unrolled loop followed by a remainder loop.
;   4. The four sums are converted to bf16 and stored (8 bytes) at
;      arg[0x58] + 2*(arg[0x90]*q + arg[0xf0]*(r / arg[0x4])
;                     + arg[0xc0]*(r % arg[0x4])) + 8*v0.
; ===========================================================================
	.section	.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr
	.protected	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params
	.globl	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params
	.p2align	8
	.type	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params,@function
_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params
; %bb.0:
; --- step 1: D = arg[0x4]*arg[0x10] (s27); q = s6 / D (s8); r kept in s17 ---
	s_load_dword s26, s[4:5], 0x4
	s_load_dword s27, s[4:5], 0x10
	s_load_dwordx2 s[2:3], s[4:5], 0x120
	s_ashr_i32 s7, s6, 31
	s_waitcnt lgkmcnt(0)
	s_mul_i32 s27, s27, s26
; Signed division: |s6| / |D| from a v_rcp_iflag_f32 reciprocal estimate,
; one Newton-style refinement of the estimate, then two conditional
; corrections of quotient/remainder; signs are re-applied with xor/sub.
	s_ashr_i32 s0, s27, 31
	s_add_i32 s1, s27, s0
	s_xor_b32 s16, s1, s0
	v_cvt_f32_u32_e32 v1, s16
	s_xor_b32 s10, s7, s0
	s_sub_i32 s0, 0, s16
	s_add_i32 s1, s6, s7
	v_rcp_iflag_f32_e32 v1, v1
	s_xor_b32 s1, s1, s7
	v_mul_f32_e32 v1, 0x4f7ffffe, v1
	v_cvt_u32_f32_e32 v1, v1
	v_readfirstlane_b32 s8, v1
	s_mul_i32 s0, s0, s8
	s_mul_hi_u32 s0, s8, s0
	s_add_i32 s8, s8, s0
	s_mul_hi_u32 s0, s1, s8
	s_mul_i32 s8, s0, s16
	s_sub_i32 s1, s1, s8
	s_add_i32 s9, s0, 1
	s_sub_i32 s8, s1, s16
	s_cmp_ge_u32 s1, s16
	s_cselect_b32 s11, s9, s0
	s_cselect_b32 s17, s8, s1
	s_add_i32 s12, s11, 1
	s_cmp_ge_u32 s17, s16
	s_cselect_b64 s[0:1], -1, 0
	; s_and_b64 with exec only regenerates SCC for the s_cselect that follows
	s_and_b64 s[8:9], s[0:1], exec
	s_cselect_b32 s8, s12, s11
	s_xor_b32 s8, s8, s10
	s_sub_i32 s8, s8, s10
	s_ashr_i32 s9, s8, 31
; {begin, end} = two ints at arg[0x120] + 4*q  ->  s20, s21;  n = s28
	s_lshl_b64 s[10:11], s[8:9], 2
	s_add_u32 s2, s2, s10
	s_addc_u32 s3, s3, s11
	s_load_dwordx2 s[20:21], s[2:3], 0x0
	s_waitcnt lgkmcnt(0)
	s_sub_i32 s28, s21, s20
	s_cmp_eq_u32 s28, 1
	s_cbranch_scc1 .LBB3_17          ; n == 1: straight to s_endpgm
; %bb.1:
; Pointer/stride arguments used later: s[12:13]=arg[0x58], s[14:15]=arg[0x90],
; s[2:3]=arg[0xc0], s[10:11]=arg[0xf0], s[18:19]=arg[0x130].
	s_load_dwordx2 s[12:13], s[4:5], 0x58
	s_load_dwordx2 s[14:15], s[4:5], 0x90
	s_load_dwordx2 s[2:3], s[4:5], 0xc0
	s_load_dwordx2 s[10:11], s[4:5], 0xf0
	s_load_dwordx2 s[18:19], s[4:5], 0x130
; Finish the remainder r (second correction + sign), then s16 = r + begin*D.
	s_sub_i32 s16, s17, s16
	s_and_b64 s[0:1], s[0:1], exec
	s_cselect_b32 s0, s16, s17
	s_xor_b32 s0, s0, s7
	s_sub_i32 s0, s0, s7
	s_mul_i32 s1, s20, s27
	s_add_i32 s16, s0, s1
; --- step 2: only work-items with v0 < 64 take part (exec saved in s[22:23]) ---
	v_cmp_gt_u32_e32 vcc, 64, v0
	s_and_saveexec_b64 s[22:23], vcc
	s_cbranch_execz .LBB3_8
; %bb.2:
	; vcc = (v0 < n); it stays live until .LBB3_6.  v1 defaults to -inf.
	v_cmp_gt_i32_e32 vcc, s28, v0
	v_mov_b32_e32 v1, 0xff800000
	s_and_saveexec_b64 s[24:25], vcc
	s_cbranch_execz .LBB3_4
; %bb.3:
	; v1 = float at arg[0x128] + 4*s16 + 4*(D * v0)
	s_load_dwordx2 s[0:1], s[4:5], 0x128
	v_mul_lo_u32 v1, s27, v0
	s_ashr_i32 s17, s16, 31
	s_lshl_b64 s[30:31], s[16:17], 2
	v_ashrrev_i32_e32 v2, 31, v1
	s_waitcnt lgkmcnt(0)
	s_add_u32 s0, s0, s30
	v_lshlrev_b64 v[1:2], 2, v[1:2]
	s_addc_u32 s1, s1, s31
	v_mov_b32_e32 v3, s1
	v_add_co_u32_e64 v1, s[0:1], s0, v1
	v_addc_co_u32_e64 v2, s[0:1], v3, v2, s[0:1]
	global_load_dword v1, v[1:2], off
.LBB3_4:                                ; %.critedge
	s_or_b64 exec, exec, s[24:25]
; Butterfly max over the wave: v3 = lane id (mbcnt), partner = (lane & 63) ^ m
; for m = 32,16,8,4,2,1; the partner indices (*4, as ds_bpermute_b32 byte
; addresses) are kept in v6,v7,v8,v9,v10,v3 and reused for the sum below.
	v_mbcnt_lo_u32_b32 v3, -1, 0
	v_mbcnt_hi_u32_b32 v3, -1, v3
	v_and_b32_e32 v4, 63, v3
	v_and_b32_e32 v5, 64, v3
	v_add_u32_e32 v5, 64, v5
	v_xor_b32_e32 v6, 32, v4
	v_cmp_lt_i32_e64 s[0:1], v6, v5
	v_cndmask_b32_e64 v6, v3, v6, s[0:1]
	s_waitcnt vmcnt(0)
	v_max_f32_e32 v2, 0xff800000, v1
	v_lshlrev_b32_e32 v6, 2, v6
	ds_bpermute_b32 v7, v6, v2
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v2, v2, v7
	v_xor_b32_e32 v7, 16, v4
	v_cmp_lt_i32_e64 s[0:1], v7, v5
	v_cndmask_b32_e64 v7, v3, v7, s[0:1]
	v_lshlrev_b32_e32 v7, 2, v7
	ds_bpermute_b32 v8, v7, v2
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v2, v2, v8
	v_xor_b32_e32 v8, 8, v4
	v_cmp_lt_i32_e64 s[0:1], v8, v5
	v_cndmask_b32_e64 v8, v3, v8, s[0:1]
	v_lshlrev_b32_e32 v8, 2, v8
	ds_bpermute_b32 v9, v8, v2
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v2, v2, v9
	v_xor_b32_e32 v9, 4, v4
	v_cmp_lt_i32_e64 s[0:1], v9, v5
	v_cndmask_b32_e64 v9, v3, v9, s[0:1]
	v_lshlrev_b32_e32 v9, 2, v9
	ds_bpermute_b32 v10, v9, v2
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v2, v2, v10
	v_xor_b32_e32 v10, 2, v4
	v_cmp_lt_i32_e64 s[0:1], v10, v5
	v_cndmask_b32_e64 v10, v3, v10, s[0:1]
	v_lshlrev_b32_e32 v10, 2, v10
	ds_bpermute_b32 v11, v10, v2
	v_xor_b32_e32 v4, 1, v4
	v_cmp_lt_i32_e64 s[0:1], v4, v5
	v_cndmask_b32_e64 v3, v3, v4, s[0:1]
	v_lshlrev_b32_e32 v3, 2, v3
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v2, v2, v11
	ds_bpermute_b32 v4, v3, v2
	s_mov_b32 s0, 0xff800000
	v_mov_b32_e32 v11, 0x42800000
	v_mov_b32_e32 v5, 0x114b4ea4
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v2, v2, v4
	; v2 = max, replaced by 0 when the max is -inf (0xff800000)
	v_cmp_lg_f32_e64 s[0:1], s0, v2
	v_cndmask_b32_e64 v2, 0, v2, s[0:1]
; exp(x - max): v_exp_f32 is base 2, so the argument is scaled by
; 0x3fb8aa3b (~log2(e)).  When x - max < 0xc2aeac50 (~ -87.34) the argument
; is first raised by 64.0 (0x42800000) and the result multiplied by
; 0x114b4ea4 (~e^-64) instead of 1.0.
	v_sub_f32_e32 v4, v1, v2
	s_mov_b32 s0, 0xc2aeac50
	v_cmp_gt_f32_e64 s[0:1], s0, v4
	v_cndmask_b32_e64 v11, 0, v11, s[0:1]
	v_add_f32_e32 v4, v4, v11
	v_mul_f32_e32 v4, 0x3fb8aa3b, v4
	v_cndmask_b32_e64 v5, 1.0, v5, s[0:1]
	v_exp_f32_e32 v4, v4
	v_mul_f32_e32 v11, v5, v4
; Butterfly sum of the exponentials with the same partner indices -> v4.
	ds_bpermute_b32 v6, v6, v11
	s_waitcnt lgkmcnt(0)
	v_fmac_f32_e32 v6, v5, v4
	ds_bpermute_b32 v4, v7, v6
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v6, v4
	ds_bpermute_b32 v5, v8, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v9, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v10, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v3, v3, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v3
	; v3 = +inf by default; lanes whose sum != 0 compute log(sum)+max at .LBB3_18
	v_cmp_lg_f32_e64 s[0:1], 0, v4
	v_mov_b32_e32 v3, 0x7f800000
	s_and_saveexec_b64 s[24:25], s[0:1]
	s_cbranch_execnz .LBB3_18
; %bb.5:
	s_or_b64 exec, exec, s[24:25]
	; work-item 0 stores v3 at .LBB3_19
	v_cmp_eq_u32_e64 s[0:1], 0, v0
	s_and_saveexec_b64 s[24:25], s[0:1]
	s_cbranch_execnz .LBB3_19
.LBB3_6:
	s_or_b64 exec, exec, s[24:25]
	s_and_b64 exec, exec, vcc        ; keep only lanes with v0 < n
	s_cbranch_execz .LBB3_8
.LBB3_7:
; LDS[4*v0] = exp(x - v3), same range-scaled exp sequence as above.
	v_sub_f32_e32 v1, v1, v3
	s_mov_b32 s0, 0xc2aeac50
	v_cmp_gt_f32_e32 vcc, s0, v1
	v_mov_b32_e32 v3, 0x42800000
	v_cndmask_b32_e32 v3, 0, v3, vcc
	v_add_f32_e32 v1, v1, v3
	v_mul_f32_e32 v1, 0x3fb8aa3b, v1
	v_mov_b32_e32 v2, 0x114b4ea4
	v_exp_f32_e32 v1, v1
	v_cndmask_b32_e32 v2, 1.0, v2, vcc
	v_mul_f32_e32 v1, v2, v1
	v_lshlrev_b32_e32 v2, 2, v0
	ds_write_b32 v2, v1
.LBB3_8:                                ; %Flow240
; --- step 3: all work-items again; accumulators v[1:2], v[3:4] start at 0 ---
	s_or_b64 exec, exec, s[22:23]
	v_lshlrev_b32_e32 v7, 2, v0
	s_mov_b32 s0, 0
	s_cmp_lt_i32 s28, 1
	v_mov_b32_e32 v2, 0
	v_mov_b32_e32 v1, 0
	v_mov_b32_e32 v4, 0
	v_mov_b32_e32 v3, 0
	s_waitcnt vmcnt(0) lgkmcnt(0)
	s_barrier
	s_waitcnt lgkmcnt(0)
	s_cbranch_scc1 .LBB3_16          ; n < 1: nothing to accumulate
; %bb.9:                                ; %.lr.ph
; s[16:17] = byte offset 4*(s16 << 9); s[4:5] = (D << 9), the per-k element
; stride (the kernarg pointer in s[4:5] is no longer needed from here on).
; s20 = end + ~begin = n - 1; fewer than 8 iterations skip the unrolled loop.
	s_lshl_b32 s4, s16, 9
	s_ashr_i32 s5, s4, 31
	s_lshl_b64 s[16:17], s[4:5], 2
	s_add_u32 s7, s18, s16
	s_addc_u32 s22, s19, s17
	s_lshl_b32 s4, s27, 9
	s_not_b32 s1, s20
	s_ashr_i32 s5, s4, 31
	s_add_i32 s20, s21, s1
	s_cmp_lt_u32 s20, 7
	s_cbranch_scc1 .LBB3_13
; %bb.10:                               ; %.lr.ph.new
; Unrolled-loop setup: v5:v0 = arg[0x130] + 16*v0, s0 = n & -8 (trip bound),
; s[18:19] = 8 strides in bytes, s[20:21] = 1 stride in bytes,
; s7 = k counter, s1 = LDS byte offset of the current weight group.
	v_lshlrev_b32_e32 v0, 4, v0
	v_mov_b32_e32 v1, s19
	v_add_co_u32_e32 v5, vcc, s18, v0
	s_and_b32 s0, s28, -8
	v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
	s_lshl_b64 s[18:19], s[4:5], 5
	s_mov_b32 s1, 0
	s_mov_b32 s7, 0
	v_mov_b32_e32 v3, 0
	v_mov_b32_e32 v4, 0
	v_mov_b32_e32 v1, 0
	v_mov_b32_e32 v2, 0
	s_lshl_b64 s[20:21], s[4:5], 2
.LBB3_11:                               ; =>This Inner Loop Header: Depth=1
; Eight 16-byte loads (one per k) are issued back to back while the next
; address is being formed; eight weights come from LDS as two b128 reads.
	v_mov_b32_e32 v6, s17
	v_add_co_u32_e32 v11, vcc, s16, v5
	v_addc_co_u32_e32 v12, vcc, v0, v6, vcc
	v_mov_b32_e32 v38, s21
	v_add_co_u32_e32 v20, vcc, s20, v11
	v_addc_co_u32_e32 v21, vcc, v12, v38, vcc
	v_add_co_u32_e32 v22, vcc, s20, v20
	v_addc_co_u32_e32 v23, vcc, v21, v38, vcc
	v_add_co_u32_e32 v28, vcc, s20, v22
	v_addc_co_u32_e32 v29, vcc, v23, v38, vcc
	v_add_co_u32_e32 v30, vcc, s20, v28
	global_load_dwordx4 v[8:11], v[11:12], off
	v_addc_co_u32_e32 v31, vcc, v29, v38, vcc
	v_add_co_u32_e32 v32, vcc, s20, v30
	global_load_dwordx4 v[12:15], v[20:21], off
	global_load_dwordx4 v[16:19], v[22:23], off
	v_addc_co_u32_e32 v33, vcc, v31, v38, vcc
	v_add_co_u32_e32 v35, vcc, s20, v32
	global_load_dwordx4 v[20:23], v[28:29], off
	global_load_dwordx4 v[24:27], v[30:31], off
	v_addc_co_u32_e32 v36, vcc, v33, v38, vcc
	global_load_dwordx4 v[28:31], v[32:33], off
	v_add_co_u32_e32 v37, vcc, s20, v35
	global_load_dwordx4 v[32:35], v[35:36], off
	v_addc_co_u32_e32 v38, vcc, v36, v38, vcc
	global_load_dwordx4 v[36:39], v[37:38], off
	v_mov_b32_e32 v6, s1
	ds_read_b128 v[40:43], v6
	ds_read_b128 v[44:47], v6 offset:16
	v_mov_b32_e32 v48, s19
	v_add_co_u32_e32 v5, vcc, s18, v5
	v_addc_co_u32_e32 v0, vcc, v0, v48, vcc
; Each weight is duplicated into a register pair so that v_pk_fma_f32 scales
; two floats at once; the vmcnt(N) waits consume the loads in issue order.
	s_waitcnt lgkmcnt(1)
	v_mov_b32_e32 v48, v40
	v_mov_b32_e32 v49, v40
	v_mov_b32_e32 v40, v41
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v50, v44
	v_mov_b32_e32 v51, v44
	v_mov_b32_e32 v44, v45
	s_add_i32 s7, s7, 8
	s_add_i32 s1, s1, 32
	s_cmp_eq_u32 s0, s7              ; SCC is consumed by the branch at loop end
	s_waitcnt vmcnt(7)
	v_pk_fma_f32 v[1:2], v[48:49], v[8:9], v[1:2]
	v_pk_fma_f32 v[3:4], v[10:11], v[48:49], v[3:4]
	s_waitcnt vmcnt(6)
	v_pk_fma_f32 v[1:2], v[40:41], v[12:13], v[1:2]
	v_pk_fma_f32 v[3:4], v[14:15], v[40:41], v[3:4]
	v_mov_b32_e32 v41, v42
	s_waitcnt vmcnt(5)
	v_pk_fma_f32 v[1:2], v[41:42], v[16:17], v[1:2]
	v_pk_fma_f32 v[3:4], v[18:19], v[41:42], v[3:4]
	v_mov_b32_e32 v42, v43
	s_waitcnt vmcnt(4)
	v_pk_fma_f32 v[1:2], v[42:43], v[20:21], v[1:2]
	v_pk_fma_f32 v[3:4], v[22:23], v[42:43], v[3:4]
	s_waitcnt vmcnt(3)
	v_pk_fma_f32 v[1:2], v[50:51], v[24:25], v[1:2]
	v_pk_fma_f32 v[3:4], v[26:27], v[50:51], v[3:4]
	s_waitcnt vmcnt(2)
	v_pk_fma_f32 v[1:2], v[44:45], v[28:29], v[1:2]
	v_pk_fma_f32 v[3:4], v[30:31], v[44:45], v[3:4]
	v_mov_b32_e32 v45, v46
	s_waitcnt vmcnt(1)
	v_pk_fma_f32 v[1:2], v[45:46], v[32:33], v[1:2]
	v_pk_fma_f32 v[3:4], v[34:35], v[45:46], v[3:4]
	v_mov_b32_e32 v46, v47
	s_waitcnt vmcnt(0)
	v_pk_fma_f32 v[1:2], v[46:47], v[36:37], v[1:2]
	v_pk_fma_f32 v[3:4], v[38:39], v[46:47], v[3:4]
	s_cbranch_scc0 .LBB3_11
; %bb.12:                               ; %._crit_edge.loopexit.unr-lcssa.loopexit
	; fold the s[16:17] base offset into the pointer for the remainder loop
	v_mov_b32_e32 v6, s17
	v_add_co_u32_e32 v5, vcc, s16, v5
	v_addc_co_u32_e32 v0, vcc, v0, v6, vcc
	s_and_b32 s7, s28, 7             ; leftover iterations
	s_cmp_eq_u32 s7, 0
	s_cbranch_scc0 .LBB3_14
	s_branch .LBB3_16
.LBB3_13:
; n < 8: pointer = (s22:s7) + 16*v0, accumulators = 0 (s0 == 0 here).
	v_lshlrev_b32_e32 v0, 2, v7
	s_mov_b32 s1, s0
	v_mov_b32_e32 v3, s22
	v_add_co_u32_e32 v5, vcc, s7, v0
	v_mov_b64_e32 v[1:2], s[0:1]
	v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
	v_mov_b64_e32 v[3:4], s[0:1]
	s_and_b32 s7, s28, 7
	s_cmp_eq_u32 s7, 0
	s_cbranch_scc1 .LBB3_16
.LBB3_14:                               ; %.epil.preheader.preheader
; Remainder loop setup: pointer is biased by +8 and the load uses offset:-8;
; s16 = LDS byte offset of the first leftover weight (4 * s0).
	v_add_co_u32_e32 v5, vcc, 8, v5
	s_lshl_b32 s16, s0, 2
	v_addc_co_u32_e32 v6, vcc, 0, v0, vcc
	s_lshl_b64 s[0:1], s[4:5], 2
.LBB3_15:                               ; %.epil.preheader
                                        ; =>This Inner Loop Header: Depth=1
	global_load_dwordx4 v[8:11], v[5:6], off offset:-8
	v_mov_b32_e32 v0, s16
	ds_read_b32 v12, v0
	v_mov_b32_e32 v13, s1
	v_add_co_u32_e32 v5, vcc, s0, v5
	s_add_i32 s16, s16, 4
	s_add_i32 s7, s7, -1
	v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v13, v12
	s_cmp_lg_u32 s7, 0
	s_waitcnt vmcnt(0)
	v_pk_fma_f32 v[1:2], v[12:13], v[8:9], v[1:2]
	v_pk_fma_f32 v[3:4], v[10:11], v[12:13], v[3:4]
	s_cbranch_scc1 .LBB3_15
.LBB3_16:                               ; %._crit_edge
; --- step 4: pack to bf16 and store.  The scalar code recomputes
; r = s6 - q*D (s4), divides it by arg[0x4] (s26) with the same signed
; reciprocal sequence as in step 1 (quotient s5, remainder s4), and forms
; three 64-bit products stride * index, each doubled (2 bytes per element).
	v_cvt_pk_bf16_f32 v0, v1, v2
	s_ashr_i32 s0, s26, 31
	s_add_i32 s1, s26, s0
	s_xor_b32 s1, s1, s0
	v_and_b32_e32 v2, 0xffff0000, v0
	v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	v_cvt_f32_u32_e32 v2, s1
	s_mul_i32 s4, s8, s27
	s_sub_i32 s4, s6, s4
	s_ashr_i32 s5, s4, 31
	v_rcp_iflag_f32_e32 v2, v2
	s_add_i32 s6, s4, s5
	s_xor_b32 s0, s5, s0
	s_xor_b32 s5, s6, s5
	v_mul_f32_e32 v2, 0x4f7ffffe, v2
	v_cvt_u32_f32_e32 v2, v2
	s_sub_i32 s6, 0, s1
	v_cvt_pk_bf16_f32 v1, v3, v4
	v_or3_b32 v0, 0, v0, 0
	v_readfirstlane_b32 s7, v2
	s_mul_i32 s6, s6, s7
	s_mul_hi_u32 s6, s7, s6
	s_add_i32 s7, s7, s6
	s_mul_hi_u32 s6, s5, s7
	s_mul_i32 s7, s6, s1
	s_sub_i32 s5, s5, s7
	s_add_i32 s7, s6, 1
	s_sub_i32 s16, s5, s1
	s_cmp_ge_u32 s5, s1
	s_cselect_b32 s6, s7, s6
	s_cselect_b32 s5, s16, s5
	s_add_i32 s7, s6, 1
	s_cmp_ge_u32 s5, s1
	s_cselect_b32 s1, s7, s6
	s_xor_b32 s1, s1, s0
	s_sub_i32 s5, s1, s0
	s_mul_i32 s0, s5, s26
	s_sub_i32 s4, s4, s0
	; s[6:7] = arg[0x58] + 2 * (arg[0x90] * q)
	s_mul_i32 s0, s14, s9
	s_mul_hi_u32 s1, s14, s8
	s_add_i32 s0, s1, s0
	s_mul_i32 s1, s15, s8
	s_add_i32 s1, s0, s1
	s_mul_i32 s0, s14, s8
	s_lshl_b64 s[0:1], s[0:1], 1
	s_add_u32 s6, s12, s0
	s_addc_u32 s7, s13, s1
	; (s6:s5) = previous + 2 * (arg[0xf0] * s5)
	s_ashr_i32 s0, s5, 31
	s_mul_i32 s0, s10, s0
	s_mul_hi_u32 s1, s10, s5
	s_add_i32 s0, s1, s0
	s_mul_i32 s1, s11, s5
	s_add_i32 s1, s0, s1
	s_mul_i32 s0, s10, s5
	s_lshl_b64 s[0:1], s[0:1], 1
	s_add_u32 s5, s6, s0
	s_addc_u32 s6, s7, s1
	; s[0:1] = previous + 2 * (arg[0xc0] * s4)
	s_ashr_i32 s0, s4, 31
	s_mul_i32 s0, s2, s0
	s_mul_hi_u32 s1, s2, s4
	s_add_i32 s0, s1, s0
	s_mul_i32 s1, s3, s4
	s_add_i32 s1, s0, s1
	s_mul_i32 s0, s2, s4
	s_lshl_b64 s[0:1], s[0:1], 1
	v_and_b32_e32 v3, 0xffff, v1
	v_and_b32_e32 v1, 0xffff0000, v1
	s_add_u32 s0, s5, s0
	v_or3_b32 v1, v3, 0, v1
	s_addc_u32 s1, s6, s1
	v_lshlrev_b32_e32 v2, 1, v7      ; v7 = 4*v0, so the byte offset is 8*v0
	global_store_dwordx2 v2, v[0:1], s[0:1]
.LBB3_17:
	s_endpgm
.LBB3_18:
; Out-of-line: v3 = ln(v4) + v2.  v_log_f32 is base 2, so the result is
; multiplied by 0x3f317218 (~ln 2).  Class mask 0x90 selects +/-denormal
; inputs, which are pre-scaled by 0x4f800000 (2^32) and compensated with
; 0xc1b17218 (~ -32*ln 2).
	s_movk_i32 s0, 0x90
	v_mov_b32_e32 v3, 0x4f800000
	v_cmp_class_f32_e64 s[0:1], v4, s0
	v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
	v_mul_f32_e32 v3, v4, v3
	v_mov_b32_e32 v4, 0xc1b17218
	v_log_f32_e32 v3, v3
	v_cndmask_b32_e64 v4, 0, v4, s[0:1]
	v_fmac_f32_e32 v4, 0x3f317218, v3
	v_add_f32_e32 v3, v4, v2
	s_or_b64 exec, exec, s[24:25]
	v_cmp_eq_u32_e64 s[0:1], 0, v0
	s_and_saveexec_b64 s[24:25], s[0:1]
	s_cbranch_execz .LBB3_6
.LBB3_19:
; Out-of-line: work-item 0 stores v3 to arg[0x60] + 4*s6 (s7 still holds the
; sign extension of s6).  This overwrites the kernarg pointer in s[4:5].
	s_load_dwordx2 s[0:1], s[4:5], 0x60
	s_lshl_b64 s[4:5], s[6:7], 2
	v_mov_b32_e32 v2, 0
	s_waitcnt lgkmcnt(0)
	s_add_u32 s0, s0, s4
	s_addc_u32 s1, s1, s5
	global_store_dword v2, v3, s[0:1]
	s_or_b64 exec, exec, s[24:25]
	s_and_b64 exec, exec, vcc
	s_cbranch_execnz .LBB3_7
	s_branch .LBB3_8
; Kernel descriptor: 256 bytes of LDS, 336 bytes of kernarg, 6 user SGPRs
; (private segment buffer + kernarg pointer), workgroup id X and one
; work-item id VGPR enabled.
	.section	.rodata,#alloc
	.p2align	6, 0x0
	.amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params
		.amdhsa_group_segment_fixed_size 256
		.amdhsa_private_segment_fixed_size 0
		.amdhsa_kernarg_size 336
		.amdhsa_user_sgpr_count 6
		.amdhsa_user_sgpr_private_segment_buffer 1
		.amdhsa_user_sgpr_dispatch_ptr 0
		.amdhsa_user_sgpr_queue_ptr 0
		.amdhsa_user_sgpr_kernarg_segment_ptr 1
		.amdhsa_user_sgpr_dispatch_id 0
		.amdhsa_user_sgpr_flat_scratch_init 0
		.amdhsa_user_sgpr_private_segment_size 0
		.amdhsa_uses_dynamic_stack 0
		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
		.amdhsa_system_sgpr_workgroup_id_x 1
		.amdhsa_system_sgpr_workgroup_id_y 0
		.amdhsa_system_sgpr_workgroup_id_z 0
		.amdhsa_system_sgpr_workgroup_info 0
		.amdhsa_system_vgpr_workitem_id 0
		.amdhsa_next_free_vgpr 52
		.amdhsa_next_free_sgpr 32
		.amdhsa_reserve_flat_scratch 0
		.amdhsa_reserve_xnack_mask 1
		.amdhsa_float_round_mode_32 0
		.amdhsa_float_round_mode_16_64 0
		.amdhsa_float_denorm_mode_32 3
		.amdhsa_float_denorm_mode_16_64 3
		.amdhsa_dx10_clamp 1
		.amdhsa_ieee_mode 1
		.amdhsa_fp16_overflow 0
		.amdhsa_bf16_overflow 0
		.amdhsa_bf8_overflow 1
		.amdhsa_fp8_overflow 1
		.amdhsa_bf16_denorm_mode 3
		.amdhsa_load_store_out_of_order 0
		.amdhsa_matrix_excp 0
		.amdhsa_exception_fp_ieee_invalid_op 0
		.amdhsa_exception_fp_denorm_src 0
		.amdhsa_exception_fp_ieee_div_zero 0
		.amdhsa_exception_fp_ieee_overflow 0
		.amdhsa_exception_fp_ieee_underflow 0
		.amdhsa_exception_fp_ieee_inexact 0
		.amdhsa_exception_int_div_zero 0
	.end_amdhsa_kernel
	.section	.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr
.Lfunc_end3:
	.size	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params, .Lfunc_end3-_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params
	; -- End function
	.section	.AMDGPU.csdata
; Kernel info:
; codeLenInByte = 2296
; NumSgprs: 36
; NumVgprs: 52
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 256 bytes/workgroup (compile time only)
; SGPRBlocks: 4
; VGPRBlocks: 12
; NumSGPRsForWavesPerEU: 36
; NumVGPRsForWavesPerEU: 52
; Occupancy: 4
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0
; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1
; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1
; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3
; ---------------------------------------------------------------------------
; Start of the next instantiation (template arguments ..., 512, 72, 128).
; Only its first instructions fall on this source line; the operands of the
; last s_load_dwordx2 below continue on the following source line.
; ---------------------------------------------------------------------------
	.section	.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr
	.protected	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params
	.globl	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params
	.p2align	8
	.type	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params,@function
_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params
; %bb.0:
	s_load_dword s28, s[4:5], 0x4
	s_load_dword s29, s[4:5], 0x10
	s_load_dwordx2 s[8:9],
s[4:5], 0x120
; (The line above holds the remaining operands of the "s_load_dwordx2 s[8:9],"
;  that ends the preceding source line.)
; ===========================================================================
; Body of flash::flash_fwd_splitkv_mla_combine_kernel<cutlass::bfloat16_t,
; float, long, 512, 72, 128>(Flash_fwd_mla_params) (name demangled from the
; symbol used in the .amdhsa_kernel / .size directives below;
; compiler-generated code).  The entry label and the first loads
; (s28 = arg[0x4], s29 = arg[0x10], s[8:9] = arg[0x120]) precede this point.
;
; Initial registers, as configured by this kernel's .amdhsa_* descriptor
; (6 user SGPRs = private segment buffer + kernarg pointer, followed by
; workgroup id X; one work-item id VGPR):
;   s[4:5] = kernarg segment pointer      s6 = workgroup id X
;   v0     = work-item id X
; Kernarg fields are referred to by byte offset only ("arg[0x..]"); their
; names inside Flash_fwd_mla_params are not visible in this file.
;
; Outline of what the code does:
;   1. D = arg[0x4] * arg[0x10];  q = s6 / D,  r = s6 % D  (signed).
;      Two consecutive ints {begin, end} are loaded from arg[0x120] + 4*q,
;      n = end - begin.  If n == 1 the kernel ends immediately.
;      NOTE(review): n is presumably the number of KV splits to combine -
;      confirm against the kernel source.
;   2. Work-items 0..63: lane i loads up to two floats, x_i (if i < n) and
;      x_(i+64) (if i + 64 < n), from arg[0x128] + 4*(r + begin*D + j*D);
;      missing values are -inf.  A 64-lane butterfly (ds_bpermute_b32)
;      produces max and sum(exp(x - max)); L = log(sum) + max, or +inf when
;      sum == 0.  Work-item 0 stores L to arg[0x60] + 4*s6; the weights
;      exp(x_j - L) go to LDS[4*i] and LDS[256 + 4*i].
;   3. After s_barrier every work-item accumulates, for k = 0..n-1,
;      LDS[4*k] * (4 floats at arg[0x130] + 4*512*(r + begin*D + k*D) + 16*v0)
;      using an 8x-unrolled loop followed by a remainder loop.
;   4. The four sums are converted to bf16 and stored (8 bytes) at
;      arg[0x58] + 2*(arg[0x90]*q + arg[0xf0]*(r / arg[0x4])
;                     + arg[0xc0]*(r % arg[0x4])) + 8*v0.
; ===========================================================================
; --- step 1: D = s29; q = s6 / D ends up in s12 (sign in s13); r in s3 ---
	s_ashr_i32 s7, s6, 31
	s_waitcnt lgkmcnt(0)
	s_mul_i32 s29, s29, s28
; Signed division: |s6| / |D| from a v_rcp_iflag_f32 reciprocal estimate,
; one refinement of the estimate, then two conditional corrections of
; quotient/remainder; signs are re-applied with xor/sub.
	s_ashr_i32 s0, s29, 31
	s_add_i32 s1, s29, s0
	s_xor_b32 s2, s1, s0
	v_cvt_f32_u32_e32 v1, s2
	s_xor_b32 s12, s7, s0
	s_sub_i32 s0, 0, s2
	s_add_i32 s1, s6, s7
	v_rcp_iflag_f32_e32 v1, v1
	s_xor_b32 s1, s1, s7
	v_mul_f32_e32 v1, 0x4f7ffffe, v1
	v_cvt_u32_f32_e32 v1, v1
	v_readfirstlane_b32 s3, v1
	s_mul_i32 s0, s0, s3
	s_mul_hi_u32 s0, s3, s0
	s_add_i32 s3, s3, s0
	s_mul_hi_u32 s0, s1, s3
	s_mul_i32 s3, s0, s2
	s_sub_i32 s1, s1, s3
	s_add_i32 s10, s0, 1
	s_sub_i32 s3, s1, s2
	s_cmp_ge_u32 s1, s2
	s_cselect_b32 s13, s10, s0
	s_cselect_b32 s3, s3, s1
	s_add_i32 s14, s13, 1
	s_cmp_ge_u32 s3, s2
	s_cselect_b64 s[0:1], -1, 0
	; s_and_b64 with exec only regenerates SCC for the s_cselect that follows
	s_and_b64 s[10:11], s[0:1], exec
	s_cselect_b32 s10, s14, s13
	s_xor_b32 s10, s10, s12
	s_sub_i32 s12, s10, s12
	s_ashr_i32 s13, s12, 31
; {begin, end} = two ints at arg[0x120] + 4*q  ->  s22, s23;  n = s30
	s_lshl_b64 s[10:11], s[12:13], 2
	s_add_u32 s8, s8, s10
	s_addc_u32 s9, s9, s11
	s_load_dwordx2 s[22:23], s[8:9], 0x0
	s_waitcnt lgkmcnt(0)
	s_sub_i32 s30, s23, s22
	s_cmp_eq_u32 s30, 1
	s_cbranch_scc1 .LBB4_20          ; n == 1: straight to s_endpgm
; %bb.1:
; Pointer/stride arguments used later: s[16:17]=arg[0x58], s[18:19]=arg[0x90],
; s[10:11]=arg[0xc0], s[14:15]=arg[0xf0], s[20:21]=arg[0x130].
	s_load_dwordx2 s[16:17], s[4:5], 0x58
	s_load_dwordx2 s[18:19], s[4:5], 0x90
	s_load_dwordx2 s[10:11], s[4:5], 0xc0
	s_load_dwordx2 s[14:15], s[4:5], 0xf0
	s_load_dwordx2 s[20:21], s[4:5], 0x130
; Finish the remainder r (second correction + sign), then s24 = r + begin*D.
	s_sub_i32 s2, s3, s2
	s_and_b64 s[0:1], s[0:1], exec
	s_cselect_b32 s0, s2, s3
	s_xor_b32 s0, s0, s7
	s_sub_i32 s0, s0, s7
	s_mul_i32 s1, s22, s29
	s_add_i32 s24, s0, s1
; --- step 2: only work-items with v0 < 64 take part (exec saved in s[26:27]) ---
	v_cmp_gt_u32_e32 vcc, 64, v0
	s_and_saveexec_b64 s[26:27], vcc
	s_cbranch_execz .LBB4_11
; %bb.2:                                ; %.preheader156.preheader
; (s31:s25) = arg[0x128] + 4*s24.  vcc = (v0 < n), live until .LBB4_8.
; v2 (first value) and v1 (second value) default to -inf.
	s_load_dwordx2 s[0:1], s[4:5], 0x128
	s_ashr_i32 s25, s24, 31
	s_lshl_b64 s[2:3], s[24:25], 2
	v_cmp_gt_i32_e32 vcc, s30, v0
	v_mov_b32_e32 v1, 0xff800000
	s_waitcnt lgkmcnt(0)
	s_add_u32 s25, s0, s2
	s_addc_u32 s31, s1, s3
	v_mov_b32_e32 v2, 0xff800000
	s_and_saveexec_b64 s[2:3], vcc
	s_cbranch_execz .LBB4_4
; %bb.3:
	; v2 = float at base + 4*(v0 * D)
	v_mul_lo_u32 v2, v0, s29
	v_mov_b32_e32 v4, s31
	v_ashrrev_i32_e32 v3, 31, v2
	v_lshlrev_b64 v[2:3], 2, v[2:3]
	v_add_co_u32_e64 v2, s[0:1], s25, v2
	v_addc_co_u32_e64 v3, s[0:1], v4, v3, s[0:1]
	global_load_dword v2, v[2:3], off
.LBB4_4:                                ; %.preheader156.1
	s_or_b64 exec, exec, s[2:3]
	; s[0:1] = (v0 + 64 < n), live until .LBB4_9
	v_add_u32_e32 v3, 64, v0
	v_cmp_gt_i32_e64 s[0:1], s30, v3
	s_and_saveexec_b64 s[8:9], s[0:1]
	s_cbranch_execz .LBB4_6
; %bb.5:
	; v1 = float at base + 4*((v0 + 64) * D)
	v_mul_lo_u32 v3, v3, s29
	v_mov_b32_e32 v1, s31
	v_ashrrev_i32_e32 v4, 31, v3
	v_lshlrev_b64 v[3:4], 2, v[3:4]
	v_add_co_u32_e64 v3, s[2:3], s25, v3
	v_addc_co_u32_e64 v4, s[2:3], v1, v4, s[2:3]
	global_load_dword v1, v[3:4], off
.LBB4_6:                                ; %.preheader155
	s_or_b64 exec, exec, s[8:9]
; Butterfly max over the wave: v4 = lane id (mbcnt), partner = (lane & 63) ^ m
; for m = 32,16,8,4,2,1; the partner indices (*4, as ds_bpermute_b32 byte
; addresses) are kept in v10..v15 and reused for the sum below.  The local
; maximum is max3(v2, -inf, v1).
	v_mbcnt_lo_u32_b32 v4, -1, 0
	v_mbcnt_hi_u32_b32 v4, -1, v4
	v_and_b32_e32 v5, 63, v4
	v_and_b32_e32 v6, 64, v4
	v_add_u32_e32 v6, 64, v6
	v_xor_b32_e32 v7, 32, v5
	v_cmp_lt_i32_e64 s[2:3], v7, v6
	s_mov_b32 s8, 0xff800000
	v_cndmask_b32_e64 v7, v4, v7, s[2:3]
	s_waitcnt vmcnt(0)
	v_max3_f32 v3, v2, s8, v1
	v_lshlrev_b32_e32 v10, 2, v7
	ds_bpermute_b32 v7, v10, v3
	v_mov_b32_e32 v8, 0x42800000
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v3, v3, v7
	v_xor_b32_e32 v7, 16, v5
	v_cmp_lt_i32_e64 s[2:3], v7, v6
	v_cndmask_b32_e64 v7, v4, v7, s[2:3]
	v_lshlrev_b32_e32 v11, 2, v7
	ds_bpermute_b32 v7, v11, v3
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v3, v3, v7
	v_xor_b32_e32 v7, 8, v5
	v_cmp_lt_i32_e64 s[2:3], v7, v6
	v_cndmask_b32_e64 v7, v4, v7, s[2:3]
	v_lshlrev_b32_e32 v12, 2, v7
	ds_bpermute_b32 v7, v12, v3
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v3, v3, v7
	v_xor_b32_e32 v7, 4, v5
	v_cmp_lt_i32_e64 s[2:3], v7, v6
	v_cndmask_b32_e64 v7, v4, v7, s[2:3]
	v_lshlrev_b32_e32 v13, 2, v7
	ds_bpermute_b32 v7, v13, v3
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v3, v3, v7
	v_xor_b32_e32 v7, 2, v5
	v_cmp_lt_i32_e64 s[2:3], v7, v6
	v_cndmask_b32_e64 v7, v4, v7, s[2:3]
	v_lshlrev_b32_e32 v14, 2, v7
	ds_bpermute_b32 v7, v14, v3
	v_xor_b32_e32 v5, 1, v5
	v_cmp_lt_i32_e64 s[2:3], v5, v6
	v_cndmask_b32_e64 v4, v4, v5, s[2:3]
	v_lshlrev_b32_e32 v15, 2, v4
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v3, v3, v7
	ds_bpermute_b32 v4, v15, v3
	v_mov_b32_e32 v6, 0x114b4ea4
	s_waitcnt lgkmcnt(0)
	v_max_f32_e32 v3, v3, v4
	; v3 = max, replaced by 0 when the max is -inf (s8 = 0xff800000)
	v_cmp_lg_f32_e64 s[2:3], s8, v3
	v_cndmask_b32_e64 v3, 0, v3, s[2:3]
; exp(x - max) for both values at once with packed ops.  v_exp_f32 is base 2,
; so the arguments are scaled by 0x3fb8aa3b (~log2(e)).  When x - max <
; 0xc2aeac50 (~ -87.34) the argument is first raised by 64.0 (0x42800000) and
; the result multiplied by 0x114b4ea4 (~e^-64) instead of 1.0.
	v_sub_f32_e32 v5, v2, v3
	v_sub_f32_e32 v4, v1, v3
	s_mov_b32 s8, 0xc2aeac50
	v_cmp_gt_f32_e64 s[2:3], s8, v5
	v_cmp_gt_f32_e64 s[8:9], s8, v4
	v_cndmask_b32_e64 v9, 0, v8, s[2:3]
	v_cndmask_b32_e64 v8, 0, v8, s[8:9]
	v_pk_add_f32 v[4:5], v[4:5], v[8:9]
	v_mov_b32_e32 v8, 0x3fb8aa3b
	v_mov_b32_e32 v9, v8
	v_pk_mul_f32 v[4:5], v[4:5], v[8:9]
	v_cndmask_b32_e64 v7, 1.0, v6, s[2:3]
	v_exp_f32_e32 v5, v5
	v_exp_f32_e32 v4, v4
	v_cndmask_b32_e64 v6, 1.0, v6, s[8:9]
	v_pk_mul_f32 v[4:5], v[6:7], v[4:5]
; Local sum of the two exponentials, then butterfly sum -> v5.
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v10, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v11, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v12, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v13, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v14, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v4, v4, v5
	ds_bpermute_b32 v5, v15, v4
	s_waitcnt lgkmcnt(0)
	v_add_f32_e32 v5, v4, v5
	; v4 = +inf by default; lanes whose sum != 0 compute log(sum)+max at .LBB4_21
	v_cmp_lg_f32_e64 s[2:3], 0, v5
	v_mov_b32_e32 v4, 0x7f800000
	s_and_saveexec_b64 s[8:9], s[2:3]
	s_cbranch_execnz .LBB4_21
; %bb.7:
	s_or_b64 exec, exec, s[8:9]
	; work-item 0 stores v4 at .LBB4_22
	v_cmp_eq_u32_e64 s[2:3], 0, v0
	s_and_saveexec_b64 s[8:9], s[2:3]
	s_cbranch_execnz .LBB4_22
.LBB4_8:
	s_or_b64 exec, exec, s[8:9]
	; lanes with v0 < n write their first weight at .LBB4_23
	s_and_saveexec_b64 s[2:3], vcc
	s_cbranch_execnz .LBB4_23
.LBB4_9:
	s_or_b64 exec, exec, s[2:3]
	s_and_b64 exec, exec, s[0:1]     ; keep only lanes with v0 + 64 < n
	s_cbranch_execz .LBB4_11
.LBB4_10:
; LDS[256 + 4*v0] = exp(v1 - v4), same range-scaled exp sequence as above.
	v_sub_f32_e32 v1, v1, v4
	s_mov_b32 s0, 0xc2aeac50
	v_cmp_gt_f32_e32 vcc, s0, v1
	v_mov_b32_e32 v3, 0x42800000
	v_cndmask_b32_e32 v3, 0, v3, vcc
	v_add_f32_e32 v1, v1, v3
	v_mul_f32_e32 v1, 0x3fb8aa3b, v1
	v_mov_b32_e32 v2, 0x114b4ea4
	v_exp_f32_e32 v1, v1
	v_cndmask_b32_e32 v2, 1.0, v2, vcc
	v_mul_f32_e32 v1, v2, v1
	v_lshlrev_b32_e32 v2, 2, v0
	ds_write_b32 v2, v1 offset:256
.LBB4_11:                               ; %Flow245
; --- step 3: all work-items again; accumulators v[1:2], v[3:4] start at 0 ---
	s_or_b64 exec, exec, s[26:27]
	v_lshlrev_b32_e32 v7, 2, v0
	s_mov_b32 s0, 0
	s_cmp_lt_i32 s30, 1
	v_mov_b32_e32 v2, 0
	v_mov_b32_e32 v1, 0
	v_mov_b32_e32 v4, 0
	v_mov_b32_e32 v3, 0
	s_waitcnt vmcnt(0) lgkmcnt(0)
	s_barrier
	s_waitcnt lgkmcnt(0)
	s_cbranch_scc1 .LBB4_19          ; n < 1: nothing to accumulate
; %bb.12:                               ; %.lr.ph
; s[4:5] = byte offset 4*(s24 << 9) (the kernarg pointer is no longer needed);
; s[2:3] = (D << 9), the per-k element stride.
; s9 = end + ~begin = n - 1; fewer than 8 iterations skip the unrolled loop.
	s_lshl_b32 s2, s24, 9
	s_ashr_i32 s3, s2, 31
	s_lshl_b64 s[4:5], s[2:3], 2
	s_add_u32 s7, s20, s4
	s_addc_u32 s8, s21, s5
	s_lshl_b32 s2, s29, 9
	s_not_b32 s1, s22
	s_ashr_i32 s3, s2, 31
	s_add_i32 s9, s23, s1
	s_cmp_lt_u32 s9, 7
	s_cbranch_scc1 .LBB4_16
; %bb.13:                               ; %.lr.ph.new
; Unrolled-loop setup: v5:v0 = arg[0x130] + 16*v0, s0 = n & -8 (trip bound),
; s[8:9] = 8 strides in bytes, s[20:21] = 1 stride in bytes,
; s7 = k counter, s1 = LDS byte offset of the current weight group.
	v_lshlrev_b32_e32 v0, 4, v0
	v_mov_b32_e32 v1, s21
	v_add_co_u32_e32 v5, vcc, s20, v0
	s_and_b32 s0, s30, -8
	v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
	s_lshl_b64 s[8:9], s[2:3], 5
	s_mov_b32 s1, 0
	s_mov_b32 s7, 0
	v_mov_b32_e32 v3, 0
	v_mov_b32_e32 v4, 0
	v_mov_b32_e32 v1, 0
	v_mov_b32_e32 v2, 0
	s_lshl_b64 s[20:21], s[2:3], 2
.LBB4_14:                               ; =>This Inner Loop Header: Depth=1
; Eight 16-byte loads (one per k) are issued back to back while the next
; address is being formed; eight weights come from LDS as two b128 reads.
	v_mov_b32_e32 v6, s5
	v_add_co_u32_e32 v11, vcc, s4, v5
	v_addc_co_u32_e32 v12, vcc, v0, v6, vcc
	v_mov_b32_e32 v38, s21
	v_add_co_u32_e32 v20, vcc, s20, v11
	v_addc_co_u32_e32 v21, vcc, v12, v38, vcc
	v_add_co_u32_e32 v22, vcc, s20, v20
	v_addc_co_u32_e32 v23, vcc, v21, v38, vcc
	v_add_co_u32_e32 v28, vcc, s20, v22
	v_addc_co_u32_e32 v29, vcc, v23, v38, vcc
	v_add_co_u32_e32 v30, vcc, s20, v28
	global_load_dwordx4 v[8:11], v[11:12], off
	v_addc_co_u32_e32 v31, vcc, v29, v38, vcc
	v_add_co_u32_e32 v32, vcc, s20, v30
	global_load_dwordx4 v[12:15], v[20:21], off
	global_load_dwordx4 v[16:19], v[22:23], off
	v_addc_co_u32_e32 v33, vcc, v31, v38, vcc
	v_add_co_u32_e32 v35, vcc, s20, v32
	global_load_dwordx4 v[20:23], v[28:29], off
	global_load_dwordx4 v[24:27], v[30:31], off
	v_addc_co_u32_e32 v36, vcc, v33, v38, vcc
	global_load_dwordx4 v[28:31], v[32:33], off
	v_add_co_u32_e32 v37, vcc, s20, v35
	global_load_dwordx4 v[32:35], v[35:36], off
	v_addc_co_u32_e32 v38, vcc, v36, v38, vcc
	global_load_dwordx4 v[36:39], v[37:38], off
	v_mov_b32_e32 v6, s1
	ds_read_b128 v[40:43], v6
	ds_read_b128 v[44:47], v6 offset:16
	v_mov_b32_e32 v48, s9
	v_add_co_u32_e32 v5, vcc, s8, v5
	v_addc_co_u32_e32 v0, vcc, v0, v48, vcc
; Each weight is duplicated into a register pair so that v_pk_fma_f32 scales
; two floats at once; the vmcnt(N) waits consume the loads in issue order.
	s_waitcnt lgkmcnt(1)
	v_mov_b32_e32 v48, v40
	v_mov_b32_e32 v49, v40
	v_mov_b32_e32 v40, v41
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v50, v44
	v_mov_b32_e32 v51, v44
	v_mov_b32_e32 v44, v45
	s_add_i32 s7, s7, 8
	s_add_i32 s1, s1, 32
	s_cmp_eq_u32 s0, s7              ; SCC is consumed by the branch at loop end
	s_waitcnt vmcnt(7)
	v_pk_fma_f32 v[1:2], v[48:49], v[8:9], v[1:2]
	v_pk_fma_f32 v[3:4], v[10:11], v[48:49], v[3:4]
	s_waitcnt vmcnt(6)
	v_pk_fma_f32 v[1:2], v[40:41], v[12:13], v[1:2]
	v_pk_fma_f32 v[3:4], v[14:15], v[40:41], v[3:4]
	v_mov_b32_e32 v41, v42
	s_waitcnt vmcnt(5)
	v_pk_fma_f32 v[1:2], v[41:42], v[16:17], v[1:2]
	v_pk_fma_f32 v[3:4], v[18:19], v[41:42], v[3:4]
	v_mov_b32_e32 v42, v43
	s_waitcnt vmcnt(4)
	v_pk_fma_f32 v[1:2], v[42:43], v[20:21], v[1:2]
	v_pk_fma_f32 v[3:4], v[22:23], v[42:43], v[3:4]
	s_waitcnt vmcnt(3)
	v_pk_fma_f32 v[1:2], v[50:51], v[24:25], v[1:2]
	v_pk_fma_f32 v[3:4], v[26:27], v[50:51], v[3:4]
	s_waitcnt vmcnt(2)
	v_pk_fma_f32 v[1:2], v[44:45], v[28:29], v[1:2]
	v_pk_fma_f32 v[3:4], v[30:31], v[44:45], v[3:4]
	v_mov_b32_e32 v45, v46
	s_waitcnt vmcnt(1)
	v_pk_fma_f32 v[1:2], v[45:46], v[32:33], v[1:2]
	v_pk_fma_f32 v[3:4], v[34:35], v[45:46], v[3:4]
	v_mov_b32_e32 v46, v47
	s_waitcnt vmcnt(0)
	v_pk_fma_f32 v[1:2], v[46:47], v[36:37], v[1:2]
	v_pk_fma_f32 v[3:4], v[38:39], v[46:47], v[3:4]
	s_cbranch_scc0 .LBB4_14
; %bb.15:                               ; %._crit_edge.loopexit.unr-lcssa.loopexit
	; fold the s[4:5] base offset into the pointer for the remainder loop
	v_mov_b32_e32 v6, s5
	v_add_co_u32_e32 v5, vcc, s4, v5
	v_addc_co_u32_e32 v0, vcc, v0, v6, vcc
	s_and_b32 s4, s30, 7             ; leftover iterations
	s_cmp_eq_u32 s4, 0
	s_cbranch_scc0 .LBB4_17
	s_branch .LBB4_19
.LBB4_16:
; n < 8: pointer = (s8:s7) + 16*v0, accumulators = 0 (s0 == 0 here).
	v_lshlrev_b32_e32 v0, 2, v7
	s_mov_b32 s1, s0
	v_mov_b32_e32 v3, s8
	v_add_co_u32_e32 v5, vcc, s7, v0
	v_mov_b64_e32 v[1:2], s[0:1]
	v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
	v_mov_b64_e32 v[3:4], s[0:1]
	s_and_b32 s4, s30, 7
	s_cmp_eq_u32 s4, 0
	s_cbranch_scc1 .LBB4_19
.LBB4_17:                               ; %.epil.preheader.preheader
; Remainder loop setup: pointer is biased by +8 and the load uses offset:-8;
; s5 = LDS byte offset of the first leftover weight (4 * s0).
	v_add_co_u32_e32 v5, vcc, 8, v5
	s_lshl_b32 s5, s0, 2
	v_addc_co_u32_e32 v6, vcc, 0, v0, vcc
	s_lshl_b64 s[0:1], s[2:3], 2
.LBB4_18:                               ; %.epil.preheader
                                        ; =>This Inner Loop Header: Depth=1
	global_load_dwordx4 v[8:11], v[5:6], off offset:-8
	v_mov_b32_e32 v0, s5
	ds_read_b32 v12, v0
	v_mov_b32_e32 v13, s1
	v_add_co_u32_e32 v5, vcc, s0, v5
	s_add_i32 s5, s5, 4
	s_add_i32 s4, s4, -1
	v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v13, v12
	s_cmp_lg_u32 s4, 0
	s_waitcnt vmcnt(0)
	v_pk_fma_f32 v[1:2], v[12:13], v[8:9], v[1:2]
	v_pk_fma_f32 v[3:4], v[10:11], v[12:13], v[3:4]
	s_cbranch_scc1 .LBB4_18
.LBB4_19:                               ; %._crit_edge
; --- step 4: pack to bf16 and store.  The scalar code recomputes
; r = s6 - q*D (s2), divides it by arg[0x4] (s28) with the same signed
; reciprocal sequence as in step 1 (quotient s3, remainder s2), and forms
; three 64-bit products stride * index, each doubled (2 bytes per element).
	v_cvt_pk_bf16_f32 v0, v1, v2
	s_ashr_i32 s0, s28, 31
	s_add_i32 s1, s28, s0
	s_xor_b32 s1, s1, s0
	v_and_b32_e32 v2, 0xffff0000, v0
	v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	v_cvt_f32_u32_e32 v2, s1
	s_mul_i32 s2, s12, s29
	s_sub_i32 s2, s6, s2
	s_ashr_i32 s3, s2, 31
	v_rcp_iflag_f32_e32 v2, v2
	s_add_i32 s4, s2, s3
	s_xor_b32 s0, s3, s0
	s_xor_b32 s3, s4, s3
	v_mul_f32_e32 v2, 0x4f7ffffe, v2
	v_cvt_u32_f32_e32 v2, v2
	s_sub_i32 s4, 0, s1
	v_cvt_pk_bf16_f32 v1, v3, v4
	v_or3_b32 v0, 0, v0, 0
	v_readfirstlane_b32 s5, v2
	s_mul_i32 s4, s4, s5
	s_mul_hi_u32 s4, s5, s4
	s_add_i32 s5, s5, s4
	s_mul_hi_u32 s4, s3, s5
	s_mul_i32 s5, s4, s1
	s_sub_i32 s3, s3, s5
	s_add_i32 s5, s4, 1
	s_sub_i32 s6, s3, s1
	s_cmp_ge_u32 s3, s1
	s_cselect_b32 s4, s5, s4
	s_cselect_b32 s3, s6, s3
	s_add_i32 s5, s4, 1
	s_cmp_ge_u32 s3, s1
	s_cselect_b32 s1, s5, s4
	s_xor_b32 s1, s1, s0
	s_sub_i32 s3, s1, s0
	s_mul_i32 s0, s3, s28
	s_sub_i32 s2, s2, s0
	; s[4:5] = arg[0x58] + 2 * (arg[0x90] * q)
	s_mul_i32 s0, s18, s13
	s_mul_hi_u32 s1, s18, s12
	s_add_i32 s0, s1, s0
	s_mul_i32 s1, s19, s12
	s_add_i32 s1, s0, s1
	s_mul_i32 s0, s18, s12
	s_lshl_b64 s[0:1], s[0:1], 1
	s_add_u32 s4, s16, s0
	s_addc_u32 s5, s17, s1
	; (s4:s3) = previous + 2 * (arg[0xf0] * s3)
	s_ashr_i32 s0, s3, 31
	s_mul_i32 s0, s14, s0
	s_mul_hi_u32 s1, s14, s3
	s_add_i32 s0, s1, s0
	s_mul_i32 s1, s15, s3
	s_add_i32 s1, s0, s1
	s_mul_i32 s0, s14, s3
	s_lshl_b64 s[0:1], s[0:1], 1
	s_add_u32 s3, s4, s0
	s_addc_u32 s4, s5, s1
	; s[0:1] = previous + 2 * (arg[0xc0] * s2)
	s_ashr_i32 s0, s2, 31
	s_mul_i32 s0, s10, s0
	s_mul_hi_u32 s1, s10, s2
	s_add_i32 s0, s1, s0
	s_mul_i32 s1, s11, s2
	s_add_i32 s1, s0, s1
	s_mul_i32 s0, s10, s2
	s_lshl_b64 s[0:1], s[0:1], 1
	v_and_b32_e32 v3, 0xffff, v1
	v_and_b32_e32 v1, 0xffff0000, v1
	s_add_u32 s0, s3, s0
	v_or3_b32 v1, v3, 0, v1
	s_addc_u32 s1, s4, s1
	v_lshlrev_b32_e32 v2, 1, v7      ; v7 = 4*v0, so the byte offset is 8*v0
	global_store_dwordx2 v2, v[0:1], s[0:1]
.LBB4_20:
	s_endpgm
.LBB4_21:
; Out-of-line: v4 = ln(v5) + v3.  v_log_f32 is base 2, so the result is
; multiplied by 0x3f317218 (~ln 2).  Class mask 0x90 selects +/-denormal
; inputs, which are pre-scaled by 0x4f800000 (2^32) and compensated with
; 0xc1b17218 (~ -32*ln 2).
	s_movk_i32 s2, 0x90
	v_mov_b32_e32 v4, 0x4f800000
	v_cmp_class_f32_e64 s[2:3], v5, s2
	v_cndmask_b32_e64 v4, 1.0, v4, s[2:3]
	v_mul_f32_e32 v4, v5, v4
	v_mov_b32_e32 v5, 0xc1b17218
	v_log_f32_e32 v4, v4
	v_cndmask_b32_e64 v5, 0, v5, s[2:3]
	v_fmac_f32_e32 v5, 0x3f317218, v4
	v_add_f32_e32 v4, v5, v3
	s_or_b64 exec, exec, s[8:9]
	v_cmp_eq_u32_e64 s[2:3], 0, v0
	s_and_saveexec_b64 s[8:9], s[2:3]
	s_cbranch_execz .LBB4_8
.LBB4_22:
; Out-of-line: work-item 0 stores v4 to arg[0x60] + 4*s6 (s7 still holds the
; sign extension of s6).  This overwrites the kernarg pointer in s[4:5].
	s_load_dwordx2 s[2:3], s[4:5], 0x60
	s_lshl_b64 s[4:5], s[6:7], 2
	v_mov_b32_e32 v3, 0
	s_waitcnt lgkmcnt(0)
	s_add_u32 s2, s2, s4
	s_addc_u32 s3, s3, s5
	global_store_dword v3, v4, s[2:3]
	s_or_b64 exec, exec, s[8:9]
	s_and_saveexec_b64 s[2:3], vcc
	s_cbranch_execz .LBB4_9
.LBB4_23:
; Out-of-line: LDS[4*v0] = exp(v2 - v4) for lanes with v0 < n, same
; range-scaled exp sequence as in .LBB4_10.
	v_sub_f32_e32 v2, v2, v4
	s_mov_b32 s4, 0xc2aeac50
	v_cmp_gt_f32_e32 vcc, s4, v2
	v_mov_b32_e32 v5, 0x42800000
	v_cndmask_b32_e32 v5, 0, v5, vcc
	v_add_f32_e32 v2, v2, v5
	v_mul_f32_e32 v2, 0x3fb8aa3b, v2
	v_mov_b32_e32 v3, 0x114b4ea4
	v_exp_f32_e32 v2, v2
	v_cndmask_b32_e32 v3, 1.0, v3, vcc
	v_mul_f32_e32 v2, v3, v2
	v_lshlrev_b32_e32 v3, 2, v0
	ds_write_b32 v3, v2
	s_or_b64 exec, exec, s[2:3]
	s_and_b64 exec, exec, s[0:1]
	s_cbranch_execnz .LBB4_10
	s_branch .LBB4_11
; Kernel descriptor: 288 bytes of LDS, 336 bytes of kernarg, 6 user SGPRs
; (private segment buffer + kernarg pointer), workgroup id X and one
; work-item id VGPR enabled.
	.section	.rodata,#alloc
	.p2align	6, 0x0
	.amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params
		.amdhsa_group_segment_fixed_size 288
		.amdhsa_private_segment_fixed_size 0
		.amdhsa_kernarg_size 336
		.amdhsa_user_sgpr_count 6
		.amdhsa_user_sgpr_private_segment_buffer 1
		.amdhsa_user_sgpr_dispatch_ptr 0
		.amdhsa_user_sgpr_queue_ptr 0
		.amdhsa_user_sgpr_kernarg_segment_ptr 1
		.amdhsa_user_sgpr_dispatch_id 0
		.amdhsa_user_sgpr_flat_scratch_init 0
		.amdhsa_user_sgpr_private_segment_size 0
		.amdhsa_uses_dynamic_stack 0
		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
		.amdhsa_system_sgpr_workgroup_id_x 1
		.amdhsa_system_sgpr_workgroup_id_y 0
		.amdhsa_system_sgpr_workgroup_id_z 0
		.amdhsa_system_sgpr_workgroup_info 0
		.amdhsa_system_vgpr_workitem_id 0
		.amdhsa_next_free_vgpr 52
		.amdhsa_next_free_sgpr 32
		.amdhsa_reserve_flat_scratch 0
		.amdhsa_reserve_xnack_mask 1
		.amdhsa_float_round_mode_32 0
		.amdhsa_float_round_mode_16_64 0
		.amdhsa_float_denorm_mode_32 3
		.amdhsa_float_denorm_mode_16_64 3
		.amdhsa_dx10_clamp 1
		.amdhsa_ieee_mode 1
		.amdhsa_fp16_overflow 0
		.amdhsa_bf16_overflow 0
		.amdhsa_bf8_overflow 1
		.amdhsa_fp8_overflow 1
		.amdhsa_bf16_denorm_mode 3
		.amdhsa_load_store_out_of_order 0
		.amdhsa_matrix_excp 0
		.amdhsa_exception_fp_ieee_invalid_op 0
		.amdhsa_exception_fp_denorm_src 0
		.amdhsa_exception_fp_ieee_div_zero 0
		.amdhsa_exception_fp_ieee_overflow 0
		.amdhsa_exception_fp_ieee_underflow 0
		.amdhsa_exception_fp_ieee_inexact 0
		.amdhsa_exception_int_div_zero 0
	.end_amdhsa_kernel
	.section	.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr
.Lfunc_end4:
	.size	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params, .Lfunc_end4-_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params
	; -- End function
	.section	.AMDGPU.csdata
; Kernel info:
; codeLenInByte = 2564
; NumSgprs: 36
; NumVgprs: 52
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 288 bytes/workgroup (compile time only)
; SGPRBlocks: 4
; VGPRBlocks: 12
; NumSGPRsForWavesPerEU: 36
; NumVGPRsForWavesPerEU: 52
; Occupancy: 4
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0
; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1
; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1
; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3
; ---------------------------------------------------------------------------
; Start of the next instantiation (template arguments ..., 512, 96, 128).
; Its body continues past this source line, so the statements below are
; reproduced unchanged and not documented further here.
; ---------------------------------------------------------------------------
	.section	.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr
	.protected	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params
	.globl	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params
	.p2align	8
	.type	_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params,@function
_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params
; %bb.0:
	s_load_dword s28, s[4:5], 0x4
	s_load_dword s29, s[4:5], 0x10
	s_load_dwordx2 s[8:9], s[4:5], 0x120
	s_ashr_i32 s7, s6, 31
	s_waitcnt lgkmcnt(0)
	s_mul_i32 s29, s29, s28
	s_ashr_i32 s0, s29, 31
	s_add_i32 s1, s29, s0
	s_xor_b32 s2, s1, s0
	v_cvt_f32_u32_e32 v1, s2
	s_xor_b32 s12, s7, s0
	s_sub_i32 s0, 0, s2
	s_add_i32 s1, s6, s7
	v_rcp_iflag_f32_e32 v1, v1
	s_xor_b32 s1, s1, s7
	v_mul_f32_e32 v1, 0x4f7ffffe, v1
	v_cvt_u32_f32_e32 v1, v1
	v_readfirstlane_b32 s3, v1
	s_mul_i32 s0, s0, s3
	s_mul_hi_u32 s0, s3, s0
	s_add_i32 s3, s3, s0
	s_mul_hi_u32 s0, s1, s3
	s_mul_i32 s3, s0, s2
	s_sub_i32 s1, s1, s3
	s_add_i32 s10, s0, 1
	s_sub_i32 s3, s1, s2
	s_cmp_ge_u32 s1, s2
	s_cselect_b32 s13, s10, s0
	s_cselect_b32 s3, s3, s1
	s_add_i32 s14, s13, 1
	s_cmp_ge_u32 s3, s2
	s_cselect_b64 s[0:1], -1, 0
	s_and_b64 s[10:11], s[0:1], exec
	s_cselect_b32 s10, s14, s13
	s_xor_b32 s10, s10, s12
	s_sub_i32 s12, s10, s12
	s_ashr_i32 s13, s12, 31
	s_lshl_b64 s[10:11], s[12:13], 2
	s_add_u32 s8, s8, s10
	s_addc_u32 s9, s9, s11
	s_load_dwordx2 s[22:23], s[8:9], 0x0
	s_waitcnt lgkmcnt(0)
s_sub_i32 s30, s23, s22 s_cmp_eq_u32 s30, 1 s_cbranch_scc1 .LBB5_20 ; %bb.1: s_load_dwordx2 s[16:17], s[4:5], 0x58 s_load_dwordx2 s[18:19], s[4:5], 0x90 s_load_dwordx2 s[10:11], s[4:5], 0xc0 s_load_dwordx2 s[14:15], s[4:5], 0xf0 s_load_dwordx2 s[20:21], s[4:5], 0x130 s_sub_i32 s2, s3, s2 s_and_b64 s[0:1], s[0:1], exec s_cselect_b32 s0, s2, s3 s_xor_b32 s0, s0, s7 s_sub_i32 s0, s0, s7 s_mul_i32 s1, s22, s29 s_add_i32 s24, s0, s1 v_cmp_gt_u32_e32 vcc, 64, v0 s_and_saveexec_b64 s[26:27], vcc s_cbranch_execz .LBB5_11 ; %bb.2: ; %.preheader156.preheader s_load_dwordx2 s[0:1], s[4:5], 0x128 s_ashr_i32 s25, s24, 31 s_lshl_b64 s[2:3], s[24:25], 2 v_cmp_gt_i32_e32 vcc, s30, v0 v_mov_b32_e32 v1, 0xff800000 s_waitcnt lgkmcnt(0) s_add_u32 s25, s0, s2 s_addc_u32 s31, s1, s3 v_mov_b32_e32 v2, 0xff800000 s_and_saveexec_b64 s[2:3], vcc s_cbranch_execz .LBB5_4 ; %bb.3: v_mul_lo_u32 v2, v0, s29 v_mov_b32_e32 v4, s31 v_ashrrev_i32_e32 v3, 31, v2 v_lshlrev_b64 v[2:3], 2, v[2:3] v_add_co_u32_e64 v2, s[0:1], s25, v2 v_addc_co_u32_e64 v3, s[0:1], v4, v3, s[0:1] global_load_dword v2, v[2:3], off .LBB5_4: ; %.preheader156.1 s_or_b64 exec, exec, s[2:3] v_add_u32_e32 v3, 64, v0 v_cmp_gt_i32_e64 s[0:1], s30, v3 s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB5_6 ; %bb.5: v_mul_lo_u32 v3, v3, s29 v_mov_b32_e32 v1, s31 v_ashrrev_i32_e32 v4, 31, v3 v_lshlrev_b64 v[3:4], 2, v[3:4] v_add_co_u32_e64 v3, s[2:3], s25, v3 v_addc_co_u32_e64 v4, s[2:3], v1, v4, s[2:3] global_load_dword v1, v[3:4], off .LBB5_6: ; %.preheader155 s_or_b64 exec, exec, s[8:9] v_mbcnt_lo_u32_b32 v4, -1, 0 v_mbcnt_hi_u32_b32 v4, -1, v4 v_and_b32_e32 v5, 63, v4 v_and_b32_e32 v6, 64, v4 v_add_u32_e32 v6, 64, v6 v_xor_b32_e32 v7, 32, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 s_mov_b32 s8, 0xff800000 v_cndmask_b32_e64 v7, v4, v7, s[2:3] s_waitcnt vmcnt(0) v_max3_f32 v3, v2, s8, v1 v_lshlrev_b32_e32 v10, 2, v7 ds_bpermute_b32 v7, v10, v3 v_mov_b32_e32 v8, 0x42800000 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 16, 
v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v11, 2, v7 ds_bpermute_b32 v7, v11, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 8, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v12, 2, v7 ds_bpermute_b32 v7, v12, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 4, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v13, 2, v7 ds_bpermute_b32 v7, v13, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 2, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v14, 2, v7 ds_bpermute_b32 v7, v14, v3 v_xor_b32_e32 v5, 1, v5 v_cmp_lt_i32_e64 s[2:3], v5, v6 v_cndmask_b32_e64 v4, v4, v5, s[2:3] v_lshlrev_b32_e32 v15, 2, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 ds_bpermute_b32 v4, v15, v3 v_mov_b32_e32 v6, 0x114b4ea4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v4 v_cmp_lg_f32_e64 s[2:3], s8, v3 v_cndmask_b32_e64 v3, 0, v3, s[2:3] v_sub_f32_e32 v5, v2, v3 v_sub_f32_e32 v4, v1, v3 s_mov_b32 s8, 0xc2aeac50 v_cmp_gt_f32_e64 s[2:3], s8, v5 v_cmp_gt_f32_e64 s[8:9], s8, v4 v_cndmask_b32_e64 v9, 0, v8, s[2:3] v_cndmask_b32_e64 v8, 0, v8, s[8:9] v_pk_add_f32 v[4:5], v[4:5], v[8:9] v_mov_b32_e32 v8, 0x3fb8aa3b v_mov_b32_e32 v9, v8 v_pk_mul_f32 v[4:5], v[4:5], v[8:9] v_cndmask_b32_e64 v7, 1.0, v6, s[2:3] v_exp_f32_e32 v5, v5 v_exp_f32_e32 v4, v4 v_cndmask_b32_e64 v6, 1.0, v6, s[8:9] v_pk_mul_f32 v[4:5], v[6:7], v[4:5] v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v10, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v11, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v12, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v13, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v14, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v15, v4 s_waitcnt lgkmcnt(0) 
v_add_f32_e32 v5, v4, v5 v_cmp_lg_f32_e64 s[2:3], 0, v5 v_mov_b32_e32 v4, 0x7f800000 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execnz .LBB5_21 ; %bb.7: s_or_b64 exec, exec, s[8:9] v_cmp_eq_u32_e64 s[2:3], 0, v0 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execnz .LBB5_22 .LBB5_8: s_or_b64 exec, exec, s[8:9] s_and_saveexec_b64 s[2:3], vcc s_cbranch_execnz .LBB5_23 .LBB5_9: s_or_b64 exec, exec, s[2:3] s_and_b64 exec, exec, s[0:1] s_cbranch_execz .LBB5_11 .LBB5_10: v_sub_f32_e32 v1, v1, v4 s_mov_b32 s0, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s0, v1 v_mov_b32_e32 v3, 0x42800000 v_cndmask_b32_e32 v3, 0, v3, vcc v_add_f32_e32 v1, v1, v3 v_mul_f32_e32 v1, 0x3fb8aa3b, v1 v_mov_b32_e32 v2, 0x114b4ea4 v_exp_f32_e32 v1, v1 v_cndmask_b32_e32 v2, 1.0, v2, vcc v_mul_f32_e32 v1, v2, v1 v_lshlrev_b32_e32 v2, 2, v0 ds_write_b32 v2, v1 offset:256 .LBB5_11: ; %Flow245 s_or_b64 exec, exec, s[26:27] v_lshlrev_b32_e32 v7, 2, v0 s_mov_b32 s0, 0 s_cmp_lt_i32 s30, 1 v_mov_b32_e32 v2, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v3, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB5_19 ; %bb.12: ; %.lr.ph s_lshl_b32 s2, s24, 9 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[4:5], s[2:3], 2 s_add_u32 s7, s20, s4 s_addc_u32 s8, s21, s5 s_lshl_b32 s2, s29, 9 s_not_b32 s1, s22 s_ashr_i32 s3, s2, 31 s_add_i32 s9, s23, s1 s_cmp_lt_u32 s9, 7 s_cbranch_scc1 .LBB5_16 ; %bb.13: ; %.lr.ph.new v_lshlrev_b32_e32 v0, 4, v0 v_mov_b32_e32 v1, s21 v_add_co_u32_e32 v5, vcc, s20, v0 s_and_b32 s0, s30, -8 v_addc_co_u32_e32 v0, vcc, 0, v1, vcc s_lshl_b64 s[8:9], s[2:3], 5 s_mov_b32 s1, 0 s_mov_b32 s7, 0 v_mov_b32_e32 v3, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v2, 0 s_lshl_b64 s[20:21], s[2:3], 2 .LBB5_14: ; =>This Inner Loop Header: Depth=1 v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v11, vcc, s4, v5 v_addc_co_u32_e32 v12, vcc, v0, v6, vcc v_mov_b32_e32 v38, s21 v_add_co_u32_e32 v20, vcc, s20, v11 v_addc_co_u32_e32 v21, vcc, v12, v38, vcc v_add_co_u32_e32 v22, vcc, s20, v20 
v_addc_co_u32_e32 v23, vcc, v21, v38, vcc v_add_co_u32_e32 v28, vcc, s20, v22 v_addc_co_u32_e32 v29, vcc, v23, v38, vcc v_add_co_u32_e32 v30, vcc, s20, v28 global_load_dwordx4 v[8:11], v[11:12], off v_addc_co_u32_e32 v31, vcc, v29, v38, vcc v_add_co_u32_e32 v32, vcc, s20, v30 global_load_dwordx4 v[12:15], v[20:21], off global_load_dwordx4 v[16:19], v[22:23], off v_addc_co_u32_e32 v33, vcc, v31, v38, vcc v_add_co_u32_e32 v35, vcc, s20, v32 global_load_dwordx4 v[20:23], v[28:29], off global_load_dwordx4 v[24:27], v[30:31], off v_addc_co_u32_e32 v36, vcc, v33, v38, vcc global_load_dwordx4 v[28:31], v[32:33], off v_add_co_u32_e32 v37, vcc, s20, v35 global_load_dwordx4 v[32:35], v[35:36], off v_addc_co_u32_e32 v38, vcc, v36, v38, vcc global_load_dwordx4 v[36:39], v[37:38], off v_mov_b32_e32 v6, s1 ds_read_b128 v[40:43], v6 ds_read_b128 v[44:47], v6 offset:16 v_mov_b32_e32 v48, s9 v_add_co_u32_e32 v5, vcc, s8, v5 v_addc_co_u32_e32 v0, vcc, v0, v48, vcc s_waitcnt lgkmcnt(1) v_mov_b32_e32 v48, v40 v_mov_b32_e32 v49, v40 v_mov_b32_e32 v40, v41 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v50, v44 v_mov_b32_e32 v51, v44 v_mov_b32_e32 v44, v45 s_add_i32 s7, s7, 8 s_add_i32 s1, s1, 32 s_cmp_eq_u32 s0, s7 s_waitcnt vmcnt(7) v_pk_fma_f32 v[1:2], v[48:49], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[48:49], v[3:4] s_waitcnt vmcnt(6) v_pk_fma_f32 v[1:2], v[40:41], v[12:13], v[1:2] v_pk_fma_f32 v[3:4], v[14:15], v[40:41], v[3:4] v_mov_b32_e32 v41, v42 s_waitcnt vmcnt(5) v_pk_fma_f32 v[1:2], v[41:42], v[16:17], v[1:2] v_pk_fma_f32 v[3:4], v[18:19], v[41:42], v[3:4] v_mov_b32_e32 v42, v43 s_waitcnt vmcnt(4) v_pk_fma_f32 v[1:2], v[42:43], v[20:21], v[1:2] v_pk_fma_f32 v[3:4], v[22:23], v[42:43], v[3:4] s_waitcnt vmcnt(3) v_pk_fma_f32 v[1:2], v[50:51], v[24:25], v[1:2] v_pk_fma_f32 v[3:4], v[26:27], v[50:51], v[3:4] s_waitcnt vmcnt(2) v_pk_fma_f32 v[1:2], v[44:45], v[28:29], v[1:2] v_pk_fma_f32 v[3:4], v[30:31], v[44:45], v[3:4] v_mov_b32_e32 v45, v46 s_waitcnt vmcnt(1) v_pk_fma_f32 v[1:2], 
v[45:46], v[32:33], v[1:2] v_pk_fma_f32 v[3:4], v[34:35], v[45:46], v[3:4] v_mov_b32_e32 v46, v47 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[46:47], v[36:37], v[1:2] v_pk_fma_f32 v[3:4], v[38:39], v[46:47], v[3:4] s_cbranch_scc0 .LBB5_14 ; %bb.15: ; %._crit_edge.loopexit.unr-lcssa.loopexit v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v5, vcc, s4, v5 v_addc_co_u32_e32 v0, vcc, v0, v6, vcc s_and_b32 s4, s30, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc0 .LBB5_17 s_branch .LBB5_19 .LBB5_16: v_lshlrev_b32_e32 v0, 2, v7 s_mov_b32 s1, s0 v_mov_b32_e32 v3, s8 v_add_co_u32_e32 v5, vcc, s7, v0 v_mov_b64_e32 v[1:2], s[0:1] v_addc_co_u32_e32 v0, vcc, 0, v3, vcc v_mov_b64_e32 v[3:4], s[0:1] s_and_b32 s4, s30, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc1 .LBB5_19 .LBB5_17: ; %.epil.preheader.preheader v_add_co_u32_e32 v5, vcc, 8, v5 s_lshl_b32 s5, s0, 2 v_addc_co_u32_e32 v6, vcc, 0, v0, vcc s_lshl_b64 s[0:1], s[2:3], 2 .LBB5_18: ; %.epil.preheader ; =>This Inner Loop Header: Depth=1 global_load_dwordx4 v[8:11], v[5:6], off offset:-8 v_mov_b32_e32 v0, s5 ds_read_b32 v12, v0 v_mov_b32_e32 v13, s1 v_add_co_u32_e32 v5, vcc, s0, v5 s_add_i32 s5, s5, 4 s_add_i32 s4, s4, -1 v_addc_co_u32_e32 v6, vcc, v6, v13, vcc s_waitcnt lgkmcnt(0) v_mov_b32_e32 v13, v12 s_cmp_lg_u32 s4, 0 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[12:13], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[12:13], v[3:4] s_cbranch_scc1 .LBB5_18 .LBB5_19: ; %._crit_edge v_cvt_pk_bf16_f32 v0, v1, v2 s_ashr_i32 s0, s28, 31 s_add_i32 s1, s28, s0 s_xor_b32 s1, s1, s0 v_and_b32_e32 v2, 0xffff0000, v0 v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD v_cvt_f32_u32_e32 v2, s1 s_mul_i32 s2, s12, s29 s_sub_i32 s2, s6, s2 s_ashr_i32 s3, s2, 31 v_rcp_iflag_f32_e32 v2, v2 s_add_i32 s4, s2, s3 s_xor_b32 s0, s3, s0 s_xor_b32 s3, s4, s3 v_mul_f32_e32 v2, 0x4f7ffffe, v2 v_cvt_u32_f32_e32 v2, v2 s_sub_i32 s4, 0, s1 v_cvt_pk_bf16_f32 v1, v3, v4 v_or3_b32 v0, 0, v0, 0 v_readfirstlane_b32 s5, v2 s_mul_i32 s4, s4, s5 
s_mul_hi_u32 s4, s5, s4 s_add_i32 s5, s5, s4 s_mul_hi_u32 s4, s3, s5 s_mul_i32 s5, s4, s1 s_sub_i32 s3, s3, s5 s_add_i32 s5, s4, 1 s_sub_i32 s6, s3, s1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s4, s5, s4 s_cselect_b32 s3, s6, s3 s_add_i32 s5, s4, 1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s1, s5, s4 s_xor_b32 s1, s1, s0 s_sub_i32 s3, s1, s0 s_mul_i32 s0, s3, s28 s_sub_i32 s2, s2, s0 s_mul_i32 s0, s18, s13 s_mul_hi_u32 s1, s18, s12 s_add_i32 s0, s1, s0 s_mul_i32 s1, s19, s12 s_add_i32 s1, s0, s1 s_mul_i32 s0, s18, s12 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s4, s16, s0 s_addc_u32 s5, s17, s1 s_ashr_i32 s0, s3, 31 s_mul_i32 s0, s14, s0 s_mul_hi_u32 s1, s14, s3 s_add_i32 s0, s1, s0 s_mul_i32 s1, s15, s3 s_add_i32 s1, s0, s1 s_mul_i32 s0, s14, s3 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s3, s4, s0 s_addc_u32 s4, s5, s1 s_ashr_i32 s0, s2, 31 s_mul_i32 s0, s10, s0 s_mul_hi_u32 s1, s10, s2 s_add_i32 s0, s1, s0 s_mul_i32 s1, s11, s2 s_add_i32 s1, s0, s1 s_mul_i32 s0, s10, s2 s_lshl_b64 s[0:1], s[0:1], 1 v_and_b32_e32 v3, 0xffff, v1 v_and_b32_e32 v1, 0xffff0000, v1 s_add_u32 s0, s3, s0 v_or3_b32 v1, v3, 0, v1 s_addc_u32 s1, s4, s1 v_lshlrev_b32_e32 v2, 1, v7 global_store_dwordx2 v2, v[0:1], s[0:1] .LBB5_20: s_endpgm .LBB5_21: s_movk_i32 s2, 0x90 v_mov_b32_e32 v4, 0x4f800000 v_cmp_class_f32_e64 s[2:3], v5, s2 v_cndmask_b32_e64 v4, 1.0, v4, s[2:3] v_mul_f32_e32 v4, v5, v4 v_mov_b32_e32 v5, 0xc1b17218 v_log_f32_e32 v4, v4 v_cndmask_b32_e64 v5, 0, v5, s[2:3] v_fmac_f32_e32 v5, 0x3f317218, v4 v_add_f32_e32 v4, v5, v3 s_or_b64 exec, exec, s[8:9] v_cmp_eq_u32_e64 s[2:3], 0, v0 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB5_8 .LBB5_22: s_load_dwordx2 s[2:3], s[4:5], 0x60 s_lshl_b64 s[4:5], s[6:7], 2 v_mov_b32_e32 v3, 0 s_waitcnt lgkmcnt(0) s_add_u32 s2, s2, s4 s_addc_u32 s3, s3, s5 global_store_dword v3, v4, s[2:3] s_or_b64 exec, exec, s[8:9] s_and_saveexec_b64 s[2:3], vcc s_cbranch_execz .LBB5_9 .LBB5_23: v_sub_f32_e32 v2, v2, v4 s_mov_b32 s4, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s4, v2 
v_mov_b32_e32 v5, 0x42800000 v_cndmask_b32_e32 v5, 0, v5, vcc v_add_f32_e32 v2, v2, v5 v_mul_f32_e32 v2, 0x3fb8aa3b, v2 v_mov_b32_e32 v3, 0x114b4ea4 v_exp_f32_e32 v2, v2 v_cndmask_b32_e32 v3, 1.0, v3, vcc v_mul_f32_e32 v2, v3, v2 v_lshlrev_b32_e32 v3, 2, v0 ds_write_b32 v3, v2 s_or_b64 exec, exec, s[2:3] s_and_b64 exec, exec, s[0:1] s_cbranch_execnz .LBB5_10 s_branch .LBB5_11 .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 384 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 52 .amdhsa_next_free_sgpr 32 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section 
.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end5: .size _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params, .Lfunc_end5-_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 2564 ; NumSgprs: 36 ; NumVgprs: 52 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 384 bytes/workgroup (compile time only) ; SGPRBlocks: 4 ; VGPRBlocks: 12 ; NumSGPRsForWavesPerEU: 36 ; NumVGPRsForWavesPerEU: 52 ; Occupancy: 4 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params .globl _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params,@function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params: ; 
; -----------------------------------------------------------------------------
; Body of the kernel whose mangled name follows; it demangles to
; flash::flash_fwd_splitkv_mla_combine_kernel<cutlass::bfloat16_t, float, long,
; 512, 128, 128>(Flash_fwd_mla_params).
; NOTE(review): this text looks like compiler-emitted AMDGPU assembly whose line
; breaks were lost (many statements share one physical line). Prefer regenerating
; it from the kernel source over hand-editing.
;
; Registers on entry, as used by the code below:
;   s[4:5] - base address of every kernarg s_load (kernarg_segment_ptr is enabled
;            in the descriptor further down)
;   s6     - scalar index that gets divided and scaled; presumably workgroup_id_x,
;            the only system SGPR the descriptor enables - TODO confirm
;   v0     - per-lane index; presumably workitem_id x - TODO confirm
; Kernarg byte offsets read: 0x4, 0x10, 0x58, 0x60, 0x90, 0xc0, 0xf0, 0x120,
; 0x128, 0x130. The field names behind them are not visible in this file.
;
; %bb.0: s28 = arg[0x4], s29 = arg[0x10] * s28. s6 is divided by s29 using the
; reciprocal-and-correct sequence (v_rcp_iflag_f32, 0x4f7ffffe, two
; compare/select steps) with signs handled through s_ashr/s_xor. The quotient
; lands in s12 (sign-extended into s13) and indexes 4-byte entries behind the
; pointer at 0x120; two consecutive dwords are loaded into s[22:23] and
; s30 = s23 - s22. When s30 == 1 the kernel jumps to .LBB6_20 (s_endpgm) without
; storing anything.
; %bb.1/%bb.2: s24 = signed remainder of that division + s22*s29; only lanes
; with v0 < 64 enter the reduction part, the others skip to .LBB6_11.
; NOTE(review): every bound in this kernel is s30. The descriptor reserves 512
; bytes of LDS (128 floats) and each lane loads at most two values, so
; s30 <= 128 is presumably guaranteed by the launcher - confirm.
; -----------------------------------------------------------------------------
@_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params ; %bb.0: s_load_dword s28, s[4:5], 0x4 s_load_dword s29, s[4:5], 0x10 s_load_dwordx2 s[8:9], s[4:5], 0x120 s_ashr_i32 s7, s6, 31 s_waitcnt lgkmcnt(0) s_mul_i32 s29, s29, s28 s_ashr_i32 s0, s29, 31 s_add_i32 s1, s29, s0 s_xor_b32 s2, s1, s0 v_cvt_f32_u32_e32 v1, s2 s_xor_b32 s12, s7, s0 s_sub_i32 s0, 0, s2 s_add_i32 s1, s6, s7 v_rcp_iflag_f32_e32 v1, v1 s_xor_b32 s1, s1, s7 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 v_readfirstlane_b32 s3, v1 s_mul_i32 s0, s0, s3 s_mul_hi_u32 s0, s3, s0 s_add_i32 s3, s3, s0 s_mul_hi_u32 s0, s1, s3 s_mul_i32 s3, s0, s2 s_sub_i32 s1, s1, s3 s_add_i32 s10, s0, 1 s_sub_i32 s3, s1, s2 s_cmp_ge_u32 s1, s2 s_cselect_b32 s13, s10, s0 s_cselect_b32 s3, s3, s1 s_add_i32 s14, s13, 1 s_cmp_ge_u32 s3, s2 s_cselect_b64 s[0:1], -1, 0 s_and_b64 s[10:11], s[0:1], exec s_cselect_b32 s10, s14, s13 s_xor_b32 s10, s10, s12 s_sub_i32 s12, s10, s12 s_ashr_i32 s13, s12, 31 s_lshl_b64 s[10:11], s[12:13], 2 s_add_u32 s8, s8, s10 s_addc_u32 s9, s9, s11 s_load_dwordx2 s[22:23], s[8:9], 0x0 s_waitcnt lgkmcnt(0) s_sub_i32 s30, s23, s22 s_cmp_eq_u32 s30, 1 s_cbranch_scc1 .LBB6_20 ; %bb.1: s_load_dwordx2 s[16:17], s[4:5], 0x58 s_load_dwordx2 s[18:19], s[4:5], 0x90 s_load_dwordx2 s[10:11], s[4:5], 0xc0 s_load_dwordx2 s[14:15], s[4:5], 0xf0 s_load_dwordx2 s[20:21], s[4:5], 0x130 s_sub_i32 s2, s3, s2 s_and_b64 s[0:1], s[0:1], exec s_cselect_b32 s0, s2, s3 s_xor_b32 s0, s0, s7 s_sub_i32 s0, s0, s7 s_mul_i32 s1, s22, s29 s_add_i32 s24, s0, s1 v_cmp_gt_u32_e32 vcc, 64, v0 s_and_saveexec_b64 s[26:27], vcc s_cbranch_execz .LBB6_11 ; %bb.2: ; %.preheader156.preheader s_load_dwordx2 s[0:1], s[4:5], 0x128 s_ashr_i32 s25, s24, 31 s_lshl_b64 s[2:3], s[24:25], 2 v_cmp_gt_i32_e32 vcc, s30, v0 v_mov_b32_e32 v1, 0xff800000 s_waitcnt lgkmcnt(0) s_add_u32 s25, s0, s2 s_addc_u32 s31, s1, s3 v_mov_b32_e32 v2, 0xff800000 s_and_saveexec_b64 s[2:3], vcc
; %bb.3 / %bb.5: each active lane fetches up to two floats from the pointer at
; 0x128: element (s24 + v0*s29) into v2 and element (s24 + (v0+64)*s29) into v1.
; Each load is guarded by a signed "index < s30" test and defaults to 0xff800000
; (-inf).
; .LBB6_6: max(v2, v1) is exchanged through six ds_bpermute_b32 steps with lanes
; at XOR distance 32, 16, 8, 4, 2, 1 (v10..v15 keep the byte addresses for reuse
; by the later sum).
s_cbranch_execz .LBB6_4 ; %bb.3: v_mul_lo_u32 v2, v0, s29 v_mov_b32_e32 v4, s31 v_ashrrev_i32_e32 v3, 31, v2 v_lshlrev_b64 v[2:3], 2, v[2:3] v_add_co_u32_e64 v2, s[0:1], s25, v2 v_addc_co_u32_e64 v3, s[0:1], v4, v3, s[0:1] global_load_dword v2, v[2:3], off .LBB6_4: ; %.preheader156.1 s_or_b64 exec, exec, s[2:3] v_add_u32_e32 v3, 64, v0 v_cmp_gt_i32_e64 s[0:1], s30, v3 s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB6_6 ; %bb.5: v_mul_lo_u32 v3, v3, s29 v_mov_b32_e32 v1, s31 v_ashrrev_i32_e32 v4, 31, v3 v_lshlrev_b64 v[3:4], 2, v[3:4] v_add_co_u32_e64 v3, s[2:3], s25, v3 v_addc_co_u32_e64 v4, s[2:3], v1, v4, s[2:3] global_load_dword v1, v[3:4], off .LBB6_6: ; %.preheader155 s_or_b64 exec, exec, s[8:9] v_mbcnt_lo_u32_b32 v4, -1, 0 v_mbcnt_hi_u32_b32 v4, -1, v4 v_and_b32_e32 v5, 63, v4 v_and_b32_e32 v6, 64, v4 v_add_u32_e32 v6, 64, v6 v_xor_b32_e32 v7, 32, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 s_mov_b32 s8, 0xff800000 v_cndmask_b32_e64 v7, v4, v7, s[2:3] s_waitcnt vmcnt(0) v_max3_f32 v3, v2, s8, v1 v_lshlrev_b32_e32 v10, 2, v7 ds_bpermute_b32 v7, v10, v3 v_mov_b32_e32 v8, 0x42800000 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 16, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v11, 2, v7 ds_bpermute_b32 v7, v11, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 8, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v12, 2, v7 ds_bpermute_b32 v7, v12, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 4, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v13, 2, v7 ds_bpermute_b32 v7, v13, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 v_xor_b32_e32 v7, 2, v5 v_cmp_lt_i32_e64 s[2:3], v7, v6 v_cndmask_b32_e64 v7, v4, v7, s[2:3] v_lshlrev_b32_e32 v14, 2, v7 ds_bpermute_b32 v7, v14, v3 v_xor_b32_e32 v5, 1, v5 v_cmp_lt_i32_e64 s[2:3], v5, v6 v_cndmask_b32_e64 v4, v4, v5, s[2:3] v_lshlrev_b32_e32
v15, 2, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v7 ds_bpermute_b32 v4, v15, v3 v_mov_b32_e32 v6, 0x114b4ea4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v3, v3, v4 v_cmp_lg_f32_e64 s[2:3], s8, v3 v_cndmask_b32_e64 v3, 0, v3, s[2:3] v_sub_f32_e32 v5, v2, v3 v_sub_f32_e32 v4, v1, v3 s_mov_b32 s8, 0xc2aeac50 v_cmp_gt_f32_e64 s[2:3], s8, v5 v_cmp_gt_f32_e64 s[8:9], s8, v4 v_cndmask_b32_e64 v9, 0, v8, s[2:3] v_cndmask_b32_e64 v8, 0, v8, s[8:9] v_pk_add_f32 v[4:5], v[4:5], v[8:9] v_mov_b32_e32 v8, 0x3fb8aa3b v_mov_b32_e32 v9, v8 v_pk_mul_f32 v[4:5], v[4:5], v[8:9] v_cndmask_b32_e64 v7, 1.0, v6, s[2:3] v_exp_f32_e32 v5, v5 v_exp_f32_e32 v4, v4 v_cndmask_b32_e64 v6, 1.0, v6, s[8:9] v_pk_mul_f32 v[4:5], v[6:7], v[4:5] v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v10, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v11, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v12, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v13, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v14, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v5 ds_bpermute_b32 v5, v15, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v5, v4, v5 v_cmp_lg_f32_e64 s[2:3], 0, v5 v_mov_b32_e32 v4, 0x7f800000 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execnz .LBB6_21 ; %bb.7: s_or_b64 exec, exec, s[8:9] v_cmp_eq_u32_e64 s[2:3], 0, v0 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execnz .LBB6_22 .LBB6_8: s_or_b64 exec, exec, s[8:9] s_and_saveexec_b64 s[2:3], vcc s_cbranch_execnz .LBB6_23 .LBB6_9: s_or_b64 exec, exec, s[2:3] s_and_b64 exec, exec, s[0:1] s_cbranch_execz .LBB6_11 .LBB6_10: v_sub_f32_e32 v1, v1, v4 s_mov_b32 s0, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s0, v1 v_mov_b32_e32 v3, 0x42800000 v_cndmask_b32_e32 v3, 0, v3, vcc v_add_f32_e32 v1, v1, v3 v_mul_f32_e32 v1, 0x3fb8aa3b, v1 v_mov_b32_e32 v2, 0x114b4ea4 v_exp_f32_e32 v1, v1 v_cndmask_b32_e32 v2, 1.0, v2, vcc v_mul_f32_e32 v1, v2, v1 v_lshlrev_b32_e32 v2, 2, v0 ds_write_b32 v2, v1 offset:256
; Summary of the block above: v3 = reduced maximum (replaced by 0 when the
; v_cmp_lg_f32 against -inf fails). exp(v2 - v3) + exp(v1 - v3) is summed across
; lanes into v5 using the expanded exp sequence (bias 0x42800000 = 64.0 below
; 0xc2aeac50, scale 0x3fb8aa3b before v_exp_f32, rescale 0x114b4ea4). If the sum
; is non-zero, .LBB6_21 sets v4 = log(sum) + v3; otherwise v4 stays 0x7f800000
; (+inf). The lane with v0 == 0 stores v4 (.LBB6_22) to the pointer at 0x60,
; element s6. .LBB6_23 and .LBB6_10 write exp(v2 - v4) / exp(v1 - v4) to LDS
; bytes 4*v0 and 4*v0 + 256, for lanes whose load guard was true.
;
; .LBB6_11 onward: all lanes rejoin, v7 = 4*v0 is saved, and after the s_barrier
; each lane accumulates into v[1:4]. Iteration i loads one dwordx4 from
; arg[0x130] + 4*512*(s24 + i*s29) + 16*v0 and FMAs it with the LDS float at
; byte 4*i. .LBB6_14 is that loop unrolled by 8 (taken when s30 - 1 >= 7);
; .LBB6_18 runs the remaining (s30 & 7) iterations.
.LBB6_11: ; %Flow245 s_or_b64 exec, exec, s[26:27] v_lshlrev_b32_e32 v7, 2, v0 s_mov_b32 s0, 0 s_cmp_lt_i32 s30, 1 v_mov_b32_e32 v2, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v3, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB6_19 ; %bb.12: ; %.lr.ph s_lshl_b32 s2, s24, 9 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[4:5], s[2:3], 2 s_add_u32 s7, s20, s4 s_addc_u32 s8, s21, s5 s_lshl_b32 s2, s29, 9 s_not_b32 s1, s22 s_ashr_i32 s3, s2, 31 s_add_i32 s9, s23, s1 s_cmp_lt_u32 s9, 7 s_cbranch_scc1 .LBB6_16 ; %bb.13: ; %.lr.ph.new v_lshlrev_b32_e32 v0, 4, v0 v_mov_b32_e32 v1, s21 v_add_co_u32_e32 v5, vcc, s20, v0 s_and_b32 s0, s30, -8 v_addc_co_u32_e32 v0, vcc, 0, v1, vcc s_lshl_b64 s[8:9], s[2:3], 5 s_mov_b32 s1, 0 s_mov_b32 s7, 0 v_mov_b32_e32 v3, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v2, 0 s_lshl_b64 s[20:21], s[2:3], 2 .LBB6_14: ; =>This Inner Loop Header: Depth=1 v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v11, vcc, s4, v5 v_addc_co_u32_e32 v12, vcc, v0, v6, vcc v_mov_b32_e32 v38, s21 v_add_co_u32_e32 v20, vcc, s20, v11 v_addc_co_u32_e32 v21, vcc, v12, v38, vcc v_add_co_u32_e32 v22, vcc, s20, v20 v_addc_co_u32_e32 v23, vcc, v21, v38, vcc v_add_co_u32_e32 v28, vcc, s20, v22 v_addc_co_u32_e32 v29, vcc, v23, v38, vcc v_add_co_u32_e32 v30, vcc, s20, v28 global_load_dwordx4 v[8:11], v[11:12], off v_addc_co_u32_e32 v31, vcc, v29, v38, vcc v_add_co_u32_e32 v32, vcc, s20, v30 global_load_dwordx4 v[12:15], v[20:21], off global_load_dwordx4 v[16:19], v[22:23], off v_addc_co_u32_e32 v33, vcc, v31, v38, vcc v_add_co_u32_e32 v35, vcc, s20, v32 global_load_dwordx4 v[20:23], v[28:29], off global_load_dwordx4 v[24:27], v[30:31], off v_addc_co_u32_e32 v36, vcc, v33, v38, vcc global_load_dwordx4 v[28:31], v[32:33], off v_add_co_u32_e32 v37, vcc, s20, v35 global_load_dwordx4 v[32:35], v[35:36], off v_addc_co_u32_e32 v38, vcc, v36, v38, vcc global_load_dwordx4 v[36:39], v[37:38], off v_mov_b32_e32 v6, s1 ds_read_b128 v[40:43], v6
ds_read_b128 v[44:47], v6 offset:16 v_mov_b32_e32 v48, s9 v_add_co_u32_e32 v5, vcc, s8, v5 v_addc_co_u32_e32 v0, vcc, v0, v48, vcc s_waitcnt lgkmcnt(1) v_mov_b32_e32 v48, v40 v_mov_b32_e32 v49, v40 v_mov_b32_e32 v40, v41 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v50, v44 v_mov_b32_e32 v51, v44 v_mov_b32_e32 v44, v45 s_add_i32 s7, s7, 8 s_add_i32 s1, s1, 32 s_cmp_eq_u32 s0, s7 s_waitcnt vmcnt(7) v_pk_fma_f32 v[1:2], v[48:49], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[48:49], v[3:4] s_waitcnt vmcnt(6) v_pk_fma_f32 v[1:2], v[40:41], v[12:13], v[1:2] v_pk_fma_f32 v[3:4], v[14:15], v[40:41], v[3:4] v_mov_b32_e32 v41, v42 s_waitcnt vmcnt(5) v_pk_fma_f32 v[1:2], v[41:42], v[16:17], v[1:2] v_pk_fma_f32 v[3:4], v[18:19], v[41:42], v[3:4] v_mov_b32_e32 v42, v43 s_waitcnt vmcnt(4) v_pk_fma_f32 v[1:2], v[42:43], v[20:21], v[1:2] v_pk_fma_f32 v[3:4], v[22:23], v[42:43], v[3:4] s_waitcnt vmcnt(3) v_pk_fma_f32 v[1:2], v[50:51], v[24:25], v[1:2] v_pk_fma_f32 v[3:4], v[26:27], v[50:51], v[3:4] s_waitcnt vmcnt(2) v_pk_fma_f32 v[1:2], v[44:45], v[28:29], v[1:2] v_pk_fma_f32 v[3:4], v[30:31], v[44:45], v[3:4] v_mov_b32_e32 v45, v46 s_waitcnt vmcnt(1) v_pk_fma_f32 v[1:2], v[45:46], v[32:33], v[1:2] v_pk_fma_f32 v[3:4], v[34:35], v[45:46], v[3:4] v_mov_b32_e32 v46, v47 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[46:47], v[36:37], v[1:2] v_pk_fma_f32 v[3:4], v[38:39], v[46:47], v[3:4] s_cbranch_scc0 .LBB6_14 ; %bb.15: ; %._crit_edge.loopexit.unr-lcssa.loopexit v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v5, vcc, s4, v5 v_addc_co_u32_e32 v0, vcc, v0, v6, vcc s_and_b32 s4, s30, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc0 .LBB6_17 s_branch .LBB6_19 .LBB6_16: v_lshlrev_b32_e32 v0, 2, v7 s_mov_b32 s1, s0 v_mov_b32_e32 v3, s8 v_add_co_u32_e32 v5, vcc, s7, v0 v_mov_b64_e32 v[1:2], s[0:1] v_addc_co_u32_e32 v0, vcc, 0, v3, vcc v_mov_b64_e32 v[3:4], s[0:1] s_and_b32 s4, s30, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc1 .LBB6_19 .LBB6_17: ; %.epil.preheader.preheader v_add_co_u32_e32 v5, vcc, 8, v5 s_lshl_b32 s5, s0, 2
v_addc_co_u32_e32 v6, vcc, 0, v0, vcc s_lshl_b64 s[0:1], s[2:3], 2 .LBB6_18: ; %.epil.preheader ; =>This Inner Loop Header: Depth=1 global_load_dwordx4 v[8:11], v[5:6], off offset:-8 v_mov_b32_e32 v0, s5 ds_read_b32 v12, v0 v_mov_b32_e32 v13, s1 v_add_co_u32_e32 v5, vcc, s0, v5 s_add_i32 s5, s5, 4 s_add_i32 s4, s4, -1 v_addc_co_u32_e32 v6, vcc, v6, v13, vcc s_waitcnt lgkmcnt(0) v_mov_b32_e32 v13, v12 s_cmp_lg_u32 s4, 0 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[12:13], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[12:13], v[3:4] s_cbranch_scc1 .LBB6_18 .LBB6_19: ; %._crit_edge v_cvt_pk_bf16_f32 v0, v1, v2 s_ashr_i32 s0, s28, 31 s_add_i32 s1, s28, s0 s_xor_b32 s1, s1, s0 v_and_b32_e32 v2, 0xffff0000, v0 v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD v_cvt_f32_u32_e32 v2, s1 s_mul_i32 s2, s12, s29 s_sub_i32 s2, s6, s2 s_ashr_i32 s3, s2, 31 v_rcp_iflag_f32_e32 v2, v2 s_add_i32 s4, s2, s3 s_xor_b32 s0, s3, s0 s_xor_b32 s3, s4, s3 v_mul_f32_e32 v2, 0x4f7ffffe, v2 v_cvt_u32_f32_e32 v2, v2 s_sub_i32 s4, 0, s1 v_cvt_pk_bf16_f32 v1, v3, v4 v_or3_b32 v0, 0, v0, 0 v_readfirstlane_b32 s5, v2 s_mul_i32 s4, s4, s5 s_mul_hi_u32 s4, s5, s4 s_add_i32 s5, s5, s4 s_mul_hi_u32 s4, s3, s5 s_mul_i32 s5, s4, s1 s_sub_i32 s3, s3, s5 s_add_i32 s5, s4, 1 s_sub_i32 s6, s3, s1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s4, s5, s4 s_cselect_b32 s3, s6, s3 s_add_i32 s5, s4, 1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s1, s5, s4 s_xor_b32 s1, s1, s0 s_sub_i32 s3, s1, s0 s_mul_i32 s0, s3, s28 s_sub_i32 s2, s2, s0 s_mul_i32 s0, s18, s13 s_mul_hi_u32 s1, s18, s12 s_add_i32 s0, s1, s0 s_mul_i32 s1, s19, s12 s_add_i32 s1, s0, s1 s_mul_i32 s0, s18, s12 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s4, s16, s0 s_addc_u32 s5, s17, s1 s_ashr_i32 s0, s3, 31 s_mul_i32 s0, s14, s0 s_mul_hi_u32 s1, s14, s3 s_add_i32 s0, s1, s0 s_mul_i32 s1, s15, s3 s_add_i32 s1, s0, s1 s_mul_i32 s0, s14, s3 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s3, s4, s0 s_addc_u32 s4, s5, s1 s_ashr_i32 s0, s2, 31
; .LBB6_19 epilogue (started above): the four accumulators v[1:4] were converted
; with v_cvt_pk_bf16_f32 and are packed into v[0:1]. The scalar code splits s6 a
; second time: r = s6 - s12*s29, s3 = r / s28 (signed, same reciprocal sequence),
; s2 = r - s3*s28. The store address is
;   arg[0x58] + 2*(arg64[0x90]*s12 + arg64[0xf0]*s3 + arg64[0xc0]*s2) + 8*v0
; written with one global_store_dwordx2 per lane. .LBB6_20 is the common
; s_endpgm; .LBB6_21/.LBB6_22/.LBB6_23 are the out-of-line blocks of the
; reduction phase. The .amdhsa_kernel descriptor and the "Kernel info" comments
; for this kernel follow. The second ".section ...Li144ELi128..." directive near
; the end starts the next kernel, which is not described here.
s_mul_i32 s0, s10, s0 s_mul_hi_u32 s1, s10, s2 s_add_i32 s0, s1, s0 s_mul_i32 s1, s11, s2 s_add_i32 s1, s0, s1 s_mul_i32 s0, s10, s2 s_lshl_b64 s[0:1], s[0:1], 1 v_and_b32_e32 v3, 0xffff, v1 v_and_b32_e32 v1, 0xffff0000, v1 s_add_u32 s0, s3, s0 v_or3_b32 v1, v3, 0, v1 s_addc_u32 s1, s4, s1 v_lshlrev_b32_e32 v2, 1, v7 global_store_dwordx2 v2, v[0:1], s[0:1] .LBB6_20: s_endpgm .LBB6_21: s_movk_i32 s2, 0x90 v_mov_b32_e32 v4, 0x4f800000 v_cmp_class_f32_e64 s[2:3], v5, s2 v_cndmask_b32_e64 v4, 1.0, v4, s[2:3] v_mul_f32_e32 v4, v5, v4 v_mov_b32_e32 v5, 0xc1b17218 v_log_f32_e32 v4, v4 v_cndmask_b32_e64 v5, 0, v5, s[2:3] v_fmac_f32_e32 v5, 0x3f317218, v4 v_add_f32_e32 v4, v5, v3 s_or_b64 exec, exec, s[8:9] v_cmp_eq_u32_e64 s[2:3], 0, v0 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB6_8 .LBB6_22: s_load_dwordx2 s[2:3], s[4:5], 0x60 s_lshl_b64 s[4:5], s[6:7], 2 v_mov_b32_e32 v3, 0 s_waitcnt lgkmcnt(0) s_add_u32 s2, s2, s4 s_addc_u32 s3, s3, s5 global_store_dword v3, v4, s[2:3] s_or_b64 exec, exec, s[8:9] s_and_saveexec_b64 s[2:3], vcc s_cbranch_execz .LBB6_9 .LBB6_23: v_sub_f32_e32 v2, v2, v4 s_mov_b32 s4, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s4, v2 v_mov_b32_e32 v5, 0x42800000 v_cndmask_b32_e32 v5, 0, v5, vcc v_add_f32_e32 v2, v2, v5 v_mul_f32_e32 v2, 0x3fb8aa3b, v2 v_mov_b32_e32 v3, 0x114b4ea4 v_exp_f32_e32 v2, v2 v_cndmask_b32_e32 v3, 1.0, v3, vcc v_mul_f32_e32 v2, v3, v2 v_lshlrev_b32_e32 v3, 2, v0 ds_write_b32 v3, v2 s_or_b64 exec, exec, s[2:3] s_and_b64 exec, exec, s[0:1] s_cbranch_execnz .LBB6_10 s_branch .LBB6_11 .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 512 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 52 .amdhsa_next_free_sgpr 32 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section .text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end6: .size _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params, .Lfunc_end6-_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 2564 ; NumSgprs: 36 ; NumVgprs: 52 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 512 bytes/workgroup (compile time only) ; SGPRBlocks: 4 ; VGPRBlocks: 12 ; NumSGPRsForWavesPerEU: 36 ; NumVGPRsForWavesPerEU: 52 ; Occupancy: 4 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ;
COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params .globl _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params,@function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params ; %bb.0: s_load_dword s30, s[4:5], 0x4 s_load_dword s31, s[4:5], 0x10 s_load_dwordx2 s[8:9], s[4:5], 0x120 s_ashr_i32 s7, s6, 31 s_waitcnt lgkmcnt(0) s_mul_i32 s31, s31, s30 s_ashr_i32 s0, s31, 31 s_add_i32 s1, s31, s0 s_xor_b32 s2, s1, s0 v_cvt_f32_u32_e32 v1, s2 s_xor_b32 s12, s7, s0 s_sub_i32 s0, 0, s2 s_add_i32 s1, s6, s7 v_rcp_iflag_f32_e32 v1, v1 s_xor_b32 s1, s1, s7 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 v_readfirstlane_b32 s3, v1 s_mul_i32 s0, s0, s3 s_mul_hi_u32 s0, s3, s0 s_add_i32 s3, s3, s0 s_mul_hi_u32 s0, s1, s3 s_mul_i32 s3, s0, s2 s_sub_i32 s1, s1, s3 s_add_i32 s10, s0, 1 s_sub_i32 s3, s1, s2 s_cmp_ge_u32 s1, s2 s_cselect_b32 s13, s10, s0 s_cselect_b32 s3, s3, s1 s_add_i32 s14, s13, 1 s_cmp_ge_u32 s3, s2 s_cselect_b64 s[0:1], -1, 0 s_and_b64
s[10:11], s[0:1], exec s_cselect_b32 s10, s14, s13 s_xor_b32 s10, s10, s12 s_sub_i32 s14, s10, s12 s_ashr_i32 s15, s14, 31 s_lshl_b64 s[10:11], s[14:15], 2 s_add_u32 s8, s8, s10 s_addc_u32 s9, s9, s11 s_load_dwordx2 s[24:25], s[8:9], 0x0 s_waitcnt lgkmcnt(0) s_sub_i32 s33, s25, s24 s_cmp_eq_u32 s33, 1 s_cbranch_scc1 .LBB7_23 ; %bb.1: s_load_dwordx2 s[18:19], s[4:5], 0x58 s_load_dwordx2 s[20:21], s[4:5], 0x90 s_load_dwordx2 s[12:13], s[4:5], 0xc0 s_load_dwordx2 s[16:17], s[4:5], 0xf0 s_load_dwordx2 s[22:23], s[4:5], 0x130 s_sub_i32 s2, s3, s2 s_and_b64 s[0:1], s[0:1], exec s_cselect_b32 s0, s2, s3 s_xor_b32 s0, s0, s7 s_sub_i32 s0, s0, s7 s_mul_i32 s1, s24, s31 s_add_i32 s26, s0, s1 v_cmp_gt_u32_e32 vcc, 64, v0 s_and_saveexec_b64 s[28:29], vcc s_cbranch_execz .LBB7_14 ; %bb.2: ; %.preheader156.preheader s_load_dwordx2 s[0:1], s[4:5], 0x128 s_ashr_i32 s27, s26, 31 s_lshl_b64 s[2:3], s[26:27], 2 v_cmp_gt_i32_e32 vcc, s33, v0 v_mov_b32_e32 v1, 0xff800000 s_waitcnt lgkmcnt(0) s_add_u32 s27, s0, s2 s_addc_u32 s34, s1, s3 v_mov_b32_e32 v2, 0xff800000 s_and_saveexec_b64 s[2:3], vcc s_cbranch_execz .LBB7_4 ; %bb.3: v_mul_lo_u32 v2, v0, s31 v_mov_b32_e32 v4, s34 v_ashrrev_i32_e32 v3, 31, v2 v_lshlrev_b64 v[2:3], 2, v[2:3] v_add_co_u32_e64 v2, s[0:1], s27, v2 v_addc_co_u32_e64 v3, s[0:1], v4, v3, s[0:1] global_load_dword v2, v[2:3], off .LBB7_4: ; %.preheader156.1 s_or_b64 exec, exec, s[2:3] v_add_u32_e32 v3, 64, v0 v_cmp_gt_i32_e64 s[0:1], s33, v3 s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB7_6 ; %bb.5: v_mul_lo_u32 v3, v3, s31 v_mov_b32_e32 v1, s34 v_ashrrev_i32_e32 v4, 31, v3 v_lshlrev_b64 v[3:4], 2, v[3:4] v_add_co_u32_e64 v3, s[2:3], s27, v3 v_addc_co_u32_e64 v4, s[2:3], v1, v4, s[2:3] global_load_dword v1, v[3:4], off .LBB7_6: ; %.preheader156.2 s_or_b64 exec, exec, s[8:9] v_or_b32_e32 v4, 0x80, v0 v_cmp_gt_i32_e64 s[2:3], s33, v4 v_mov_b32_e32 v3, 0xff800000 s_and_saveexec_b64 s[10:11], s[2:3] s_cbranch_execz .LBB7_8 ; %bb.7: v_mul_lo_u32 v3, v4, s31 
v_mov_b32_e32 v5, s34 v_ashrrev_i32_e32 v4, 31, v3 v_lshlrev_b64 v[3:4], 2, v[3:4] v_add_co_u32_e64 v3, s[8:9], s27, v3 v_addc_co_u32_e64 v4, s[8:9], v5, v4, s[8:9] global_load_dword v3, v[3:4], off .LBB7_8: ; %.preheader155 s_or_b64 exec, exec, s[10:11] v_mbcnt_lo_u32_b32 v5, -1, 0 v_mbcnt_hi_u32_b32 v5, -1, v5 v_and_b32_e32 v6, 63, v5 v_and_b32_e32 v7, 64, v5 v_add_u32_e32 v7, 64, v7 v_xor_b32_e32 v8, 32, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 s_waitcnt vmcnt(0) v_max_f32_e32 v4, 0xff800000, v2 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_max3_f32 v4, v4, v1, v3 v_lshlrev_b32_e32 v12, 2, v8 ds_bpermute_b32 v8, v12, v4 s_mov_b32 s10, 0xc2aeac50 v_mov_b32_e32 v10, 0x42800000 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 16, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v13, 2, v8 ds_bpermute_b32 v8, v13, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 8, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v14, 2, v8 ds_bpermute_b32 v8, v14, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 4, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v15, 2, v8 ds_bpermute_b32 v8, v15, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 2, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v16, 2, v8 ds_bpermute_b32 v8, v16, v4 v_xor_b32_e32 v6, 1, v6 v_cmp_lt_i32_e64 s[8:9], v6, v7 v_cndmask_b32_e64 v5, v5, v6, s[8:9] v_lshlrev_b32_e32 v17, 2, v5 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 ds_bpermute_b32 v5, v17, v4 s_mov_b32 s8, 0xff800000 v_mov_b32_e32 v8, 0x114b4ea4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v5 v_cmp_lg_f32_e64 s[8:9], s8, v4 v_cndmask_b32_e64 v5, 0, v4, s[8:9] v_sub_f32_e32 v4, v2, v5 v_cmp_gt_f32_e64 s[8:9], s10, v4 v_cndmask_b32_e64 v6, 0, v10, s[8:9] v_add_f32_e32 v4, v4, v6 v_sub_f32_e32 v7, v1, v5 v_sub_f32_e32 v6, v3, v5 
v_cndmask_b32_e64 v18, 1.0, v8, s[8:9] v_cmp_gt_f32_e64 s[8:9], s10, v7 v_cmp_gt_f32_e64 s[10:11], s10, v6 v_cndmask_b32_e64 v11, 0, v10, s[8:9] v_cndmask_b32_e64 v10, 0, v10, s[10:11] v_pk_add_f32 v[6:7], v[6:7], v[10:11] v_mov_b32_e32 v10, 0x3fb8aa3b v_mov_b32_e32 v11, v10 v_pk_mul_f32 v[6:7], v[6:7], v[10:11] v_mul_f32_e32 v4, 0x3fb8aa3b, v4 v_exp_f32_e32 v7, v7 v_exp_f32_e32 v6, v6 v_exp_f32_e32 v4, v4 v_cndmask_b32_e64 v9, 1.0, v8, s[8:9] v_cndmask_b32_e64 v8, 1.0, v8, s[10:11] v_pk_mul_f32 v[6:7], v[8:9], v[6:7] v_fma_f32 v4, v18, v4, v7 v_add_f32_e32 v4, v6, v4 ds_bpermute_b32 v6, v12, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v13, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v14, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v15, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v16, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v17, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v6, v4, v6 v_cmp_lg_f32_e64 s[8:9], 0, v6 v_mov_b32_e32 v4, 0x7f800000 s_and_saveexec_b64 s[10:11], s[8:9] s_cbranch_execnz .LBB7_24 ; %bb.9: s_or_b64 exec, exec, s[10:11] v_cmp_eq_u32_e64 s[8:9], 0, v0 s_and_saveexec_b64 s[10:11], s[8:9] s_cbranch_execnz .LBB7_25 .LBB7_10: s_or_b64 exec, exec, s[10:11] s_and_saveexec_b64 s[4:5], vcc s_cbranch_execnz .LBB7_26 .LBB7_11: s_or_b64 exec, exec, s[4:5] s_and_saveexec_b64 s[4:5], s[0:1] s_cbranch_execnz .LBB7_27 .LBB7_12: s_or_b64 exec, exec, s[4:5] s_and_b64 exec, exec, s[2:3] s_cbranch_execz .LBB7_14 .LBB7_13: v_sub_f32_e32 v1, v3, v4 s_mov_b32 s0, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s0, v1 v_mov_b32_e32 v3, 0x42800000 v_cndmask_b32_e32 v3, 0, v3, vcc v_add_f32_e32 v1, v1, v3 v_mul_f32_e32 v1, 0x3fb8aa3b, v1 v_mov_b32_e32 v2, 0x114b4ea4 v_exp_f32_e32 v1, v1 v_cndmask_b32_e32 v2, 1.0, v2, vcc v_mul_f32_e32 v1, v2, v1 v_lshlrev_b32_e32 v2, 2, v0 ds_write_b32 v2, v1 offset:512 .LBB7_14: ; %Flow245 s_or_b64 exec, exec, s[28:29] 
v_lshlrev_b32_e32 v7, 2, v0 s_mov_b32 s0, 0 s_cmp_lt_i32 s33, 1 v_mov_b32_e32 v2, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v3, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB7_22 ; %bb.15: ; %.lr.ph s_lshl_b32 s2, s26, 9 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[4:5], s[2:3], 2 s_add_u32 s7, s22, s4 s_addc_u32 s8, s23, s5 s_lshl_b32 s2, s31, 9 s_not_b32 s1, s24 s_ashr_i32 s3, s2, 31 s_add_i32 s9, s25, s1 s_cmp_lt_u32 s9, 7 s_cbranch_scc1 .LBB7_19 ; %bb.16: ; %.lr.ph.new v_lshlrev_b32_e32 v0, 4, v0 v_mov_b32_e32 v1, s23 v_add_co_u32_e32 v5, vcc, s22, v0 s_and_b32 s0, s33, -8 v_addc_co_u32_e32 v0, vcc, 0, v1, vcc s_lshl_b64 s[8:9], s[2:3], 5 s_mov_b32 s1, 0 s_mov_b32 s7, 0 v_mov_b32_e32 v3, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v2, 0 s_lshl_b64 s[10:11], s[2:3], 2 .LBB7_17: ; =>This Inner Loop Header: Depth=1 v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v11, vcc, s4, v5 v_addc_co_u32_e32 v12, vcc, v0, v6, vcc v_mov_b32_e32 v38, s11 v_add_co_u32_e32 v20, vcc, s10, v11 v_addc_co_u32_e32 v21, vcc, v12, v38, vcc v_add_co_u32_e32 v22, vcc, s10, v20 v_addc_co_u32_e32 v23, vcc, v21, v38, vcc v_add_co_u32_e32 v28, vcc, s10, v22 v_addc_co_u32_e32 v29, vcc, v23, v38, vcc v_add_co_u32_e32 v30, vcc, s10, v28 global_load_dwordx4 v[8:11], v[11:12], off v_addc_co_u32_e32 v31, vcc, v29, v38, vcc v_add_co_u32_e32 v32, vcc, s10, v30 global_load_dwordx4 v[12:15], v[20:21], off global_load_dwordx4 v[16:19], v[22:23], off v_addc_co_u32_e32 v33, vcc, v31, v38, vcc v_add_co_u32_e32 v35, vcc, s10, v32 global_load_dwordx4 v[20:23], v[28:29], off global_load_dwordx4 v[24:27], v[30:31], off v_addc_co_u32_e32 v36, vcc, v33, v38, vcc global_load_dwordx4 v[28:31], v[32:33], off v_add_co_u32_e32 v37, vcc, s10, v35 global_load_dwordx4 v[32:35], v[35:36], off v_addc_co_u32_e32 v38, vcc, v36, v38, vcc global_load_dwordx4 v[36:39], v[37:38], off v_mov_b32_e32 v6, s1 ds_read_b128 v[40:43], v6 ds_read_b128 v[44:47], v6 offset:16 v_mov_b32_e32 v48, s9 
v_add_co_u32_e32 v5, vcc, s8, v5 v_addc_co_u32_e32 v0, vcc, v0, v48, vcc s_waitcnt lgkmcnt(1) v_mov_b32_e32 v48, v40 v_mov_b32_e32 v49, v40 v_mov_b32_e32 v40, v41 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v50, v44 v_mov_b32_e32 v51, v44 v_mov_b32_e32 v44, v45 s_add_i32 s7, s7, 8 s_add_i32 s1, s1, 32 s_cmp_eq_u32 s0, s7 s_waitcnt vmcnt(7) v_pk_fma_f32 v[1:2], v[48:49], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[48:49], v[3:4] s_waitcnt vmcnt(6) v_pk_fma_f32 v[1:2], v[40:41], v[12:13], v[1:2] v_pk_fma_f32 v[3:4], v[14:15], v[40:41], v[3:4] v_mov_b32_e32 v41, v42 s_waitcnt vmcnt(5) v_pk_fma_f32 v[1:2], v[41:42], v[16:17], v[1:2] v_pk_fma_f32 v[3:4], v[18:19], v[41:42], v[3:4] v_mov_b32_e32 v42, v43 s_waitcnt vmcnt(4) v_pk_fma_f32 v[1:2], v[42:43], v[20:21], v[1:2] v_pk_fma_f32 v[3:4], v[22:23], v[42:43], v[3:4] s_waitcnt vmcnt(3) v_pk_fma_f32 v[1:2], v[50:51], v[24:25], v[1:2] v_pk_fma_f32 v[3:4], v[26:27], v[50:51], v[3:4] s_waitcnt vmcnt(2) v_pk_fma_f32 v[1:2], v[44:45], v[28:29], v[1:2] v_pk_fma_f32 v[3:4], v[30:31], v[44:45], v[3:4] v_mov_b32_e32 v45, v46 s_waitcnt vmcnt(1) v_pk_fma_f32 v[1:2], v[45:46], v[32:33], v[1:2] v_pk_fma_f32 v[3:4], v[34:35], v[45:46], v[3:4] v_mov_b32_e32 v46, v47 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[46:47], v[36:37], v[1:2] v_pk_fma_f32 v[3:4], v[38:39], v[46:47], v[3:4] s_cbranch_scc0 .LBB7_17 ; %bb.18: ; %._crit_edge.loopexit.unr-lcssa.loopexit v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v5, vcc, s4, v5 v_addc_co_u32_e32 v0, vcc, v0, v6, vcc s_and_b32 s4, s33, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc0 .LBB7_20 s_branch .LBB7_22 .LBB7_19: v_lshlrev_b32_e32 v0, 2, v7 s_mov_b32 s1, s0 v_mov_b32_e32 v3, s8 v_add_co_u32_e32 v5, vcc, s7, v0 v_mov_b64_e32 v[1:2], s[0:1] v_addc_co_u32_e32 v0, vcc, 0, v3, vcc v_mov_b64_e32 v[3:4], s[0:1] s_and_b32 s4, s33, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc1 .LBB7_22 .LBB7_20: ; %.epil.preheader.preheader v_add_co_u32_e32 v5, vcc, 8, v5 s_lshl_b32 s5, s0, 2 v_addc_co_u32_e32 v6, vcc, 0, v0, vcc s_lshl_b64 s[0:1], 
s[2:3], 2 .LBB7_21: ; %.epil.preheader ; =>This Inner Loop Header: Depth=1 global_load_dwordx4 v[8:11], v[5:6], off offset:-8 v_mov_b32_e32 v0, s5 ds_read_b32 v12, v0 v_mov_b32_e32 v13, s1 v_add_co_u32_e32 v5, vcc, s0, v5 s_add_i32 s5, s5, 4 s_add_i32 s4, s4, -1 v_addc_co_u32_e32 v6, vcc, v6, v13, vcc s_waitcnt lgkmcnt(0) v_mov_b32_e32 v13, v12 s_cmp_lg_u32 s4, 0 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[12:13], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[12:13], v[3:4] s_cbranch_scc1 .LBB7_21 .LBB7_22: ; %._crit_edge v_cvt_pk_bf16_f32 v0, v1, v2 s_ashr_i32 s0, s30, 31 s_add_i32 s1, s30, s0 s_xor_b32 s1, s1, s0 v_and_b32_e32 v2, 0xffff0000, v0 v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD v_cvt_f32_u32_e32 v2, s1 s_mul_i32 s2, s14, s31 s_sub_i32 s2, s6, s2 s_ashr_i32 s3, s2, 31 v_rcp_iflag_f32_e32 v2, v2 s_add_i32 s4, s2, s3 s_xor_b32 s0, s3, s0 s_xor_b32 s3, s4, s3 v_mul_f32_e32 v2, 0x4f7ffffe, v2 v_cvt_u32_f32_e32 v2, v2 s_sub_i32 s4, 0, s1 v_cvt_pk_bf16_f32 v1, v3, v4 v_or3_b32 v0, 0, v0, 0 v_readfirstlane_b32 s5, v2 s_mul_i32 s4, s4, s5 s_mul_hi_u32 s4, s5, s4 s_add_i32 s5, s5, s4 s_mul_hi_u32 s4, s3, s5 s_mul_i32 s5, s4, s1 s_sub_i32 s3, s3, s5 s_add_i32 s5, s4, 1 s_sub_i32 s6, s3, s1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s4, s5, s4 s_cselect_b32 s3, s6, s3 s_add_i32 s5, s4, 1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s1, s5, s4 s_xor_b32 s1, s1, s0 s_sub_i32 s3, s1, s0 s_mul_i32 s0, s3, s30 s_sub_i32 s2, s2, s0 s_mul_i32 s0, s20, s15 s_mul_hi_u32 s1, s20, s14 s_add_i32 s0, s1, s0 s_mul_i32 s1, s21, s14 s_add_i32 s1, s0, s1 s_mul_i32 s0, s20, s14 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s4, s18, s0 s_addc_u32 s5, s19, s1 s_ashr_i32 s0, s3, 31 s_mul_i32 s0, s16, s0 s_mul_hi_u32 s1, s16, s3 s_add_i32 s0, s1, s0 s_mul_i32 s1, s17, s3 s_add_i32 s1, s0, s1 s_mul_i32 s0, s16, s3 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s3, s4, s0 s_addc_u32 s4, s5, s1 s_ashr_i32 s0, s2, 31 s_mul_i32 s0, s12, s0 s_mul_hi_u32 s1, s12, s2 s_add_i32 s0, 
s1, s0 s_mul_i32 s1, s13, s2 s_add_i32 s1, s0, s1 s_mul_i32 s0, s12, s2 s_lshl_b64 s[0:1], s[0:1], 1 v_and_b32_e32 v3, 0xffff, v1 v_and_b32_e32 v1, 0xffff0000, v1 s_add_u32 s0, s3, s0 v_or3_b32 v1, v3, 0, v1 s_addc_u32 s1, s4, s1 v_lshlrev_b32_e32 v2, 1, v7 global_store_dwordx2 v2, v[0:1], s[0:1] .LBB7_23: s_endpgm .LBB7_24: s_movk_i32 s8, 0x90 v_mov_b32_e32 v4, 0x4f800000 v_cmp_class_f32_e64 s[8:9], v6, s8 v_cndmask_b32_e64 v4, 1.0, v4, s[8:9] v_mul_f32_e32 v4, v6, v4 v_mov_b32_e32 v6, 0xc1b17218 v_log_f32_e32 v4, v4 v_cndmask_b32_e64 v6, 0, v6, s[8:9] v_fmac_f32_e32 v6, 0x3f317218, v4 v_add_f32_e32 v4, v6, v5 s_or_b64 exec, exec, s[10:11] v_cmp_eq_u32_e64 s[8:9], 0, v0 s_and_saveexec_b64 s[10:11], s[8:9] s_cbranch_execz .LBB7_10 .LBB7_25: s_load_dwordx2 s[4:5], s[4:5], 0x60 s_lshl_b64 s[8:9], s[6:7], 2 v_mov_b32_e32 v5, 0 s_waitcnt lgkmcnt(0) s_add_u32 s4, s4, s8 s_addc_u32 s5, s5, s9 global_store_dword v5, v4, s[4:5] s_or_b64 exec, exec, s[10:11] s_and_saveexec_b64 s[4:5], vcc s_cbranch_execz .LBB7_11 .LBB7_26: v_sub_f32_e32 v2, v2, v4 s_mov_b32 s7, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s7, v2 v_mov_b32_e32 v6, 0x42800000 v_cndmask_b32_e32 v6, 0, v6, vcc v_add_f32_e32 v2, v2, v6 v_mul_f32_e32 v2, 0x3fb8aa3b, v2 v_mov_b32_e32 v5, 0x114b4ea4 v_exp_f32_e32 v2, v2 v_cndmask_b32_e32 v5, 1.0, v5, vcc v_mul_f32_e32 v2, v5, v2 v_lshlrev_b32_e32 v5, 2, v0 ds_write_b32 v5, v2 s_or_b64 exec, exec, s[4:5] s_and_saveexec_b64 s[4:5], s[0:1] s_cbranch_execz .LBB7_12 .LBB7_27: v_sub_f32_e32 v1, v1, v4 s_mov_b32 s0, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s0, v1 v_mov_b32_e32 v5, 0x42800000 v_cndmask_b32_e32 v5, 0, v5, vcc v_add_f32_e32 v1, v1, v5 v_mul_f32_e32 v1, 0x3fb8aa3b, v1 v_mov_b32_e32 v2, 0x114b4ea4 v_exp_f32_e32 v1, v1 v_cndmask_b32_e32 v2, 1.0, v2, vcc v_mul_f32_e32 v1, v2, v1 v_lshlrev_b32_e32 v2, 2, v0 ds_write_b32 v2, v1 offset:256 s_or_b64 exec, exec, s[4:5] s_and_b64 exec, exec, s[2:3] s_cbranch_execnz .LBB7_13 s_branch .LBB7_14 .section .rodata,#alloc .p2align 6, 0x0 
.amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 576 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 52 .amdhsa_next_free_sgpr 35 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section .text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end7: .size _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params, .Lfunc_end7-_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 
; ---------------------------------------------------------------------------
; NOTE(review): compiler-emitted AMDGPU listing whose line structure has been
; collapsed; physical lines break at arbitrary points (sometimes inside an
; instruction), so review comments are inserted only between physical lines.
; The next line opens with the tail of the preceding kernel's "Kernel info"
; comment; the kernel documented here starts at its `.section .text.` directive.
;
; Kernel: flash::flash_fwd_splitkv_mla_combine_kernel<cutlass::bfloat16_t,
;         float, long, 512, 160, 128>(Flash_fwd_mla_params)  [demangled symbol]
; What the code does: per workgroup it reduces s33 float values to a
; log-sum-exp, writes exp(value - lse) weights to LDS, then uses those weights
; to accumulate s33 rows of float data and stores the result as packed bf16.
; Inputs: s[4:5] = base of every kernarg s_load_*; s6 = signed 32-bit block
;         index (presumably workgroup id X, per the descriptor - confirm);
;         v0 = work-item id.
; %bb.0 : s30 = kernarg[0x4]; s31 = kernarg[0x10] * s30. Signed s6 / s31 via a
;         float reciprocal (v_rcp_iflag_f32 plus two conditional corrections)
;         gives the quotient s[14:15]; it indexes the dword array at
;         kernarg[0x120] and two consecutive dwords land in s24, s25.
;         s33 = s25 - s24; when s33 == 1 the kernel exits at once (.LBB8_23).
; %bb.1 : loads 64-bit values from kernarg 0x58/0x90/0xc0/0xf0/0x130 and forms
;         s26 = (s6 rem s31) + s24 * s31. Only work-items with v0 < 64 run
;         %bb.2 .. .LBB8_13 (previous exec mask saved in s[28:29]).
; %bb.2-5: each such lane loads floats from the array at kernarg[0x128],
;         element s26 + k * s31 for k = v0 and k = v0 + 64, each guarded by
;         k < s33; slots not loaded keep 0xff800000 (-inf). Values: v2, v1.
; ---------------------------------------------------------------------------
2828 ; NumSgprs: 39 ; NumVgprs: 52 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 576 bytes/workgroup (compile time only) ; SGPRBlocks: 4 ; VGPRBlocks: 12 ; NumSGPRsForWavesPerEU: 39 ; NumVGPRsForWavesPerEU: 52 ; Occupancy: 4 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params .globl _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params,@function _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params ; %bb.0: s_load_dword s30, s[4:5], 0x4 s_load_dword s31, s[4:5], 0x10 s_load_dwordx2 s[8:9], s[4:5], 0x120 s_ashr_i32 s7, s6, 31 s_waitcnt lgkmcnt(0) s_mul_i32 s31, s31, s30 s_ashr_i32 s0, s31, 31 s_add_i32 s1, s31, s0 s_xor_b32 s2, s1, s0 v_cvt_f32_u32_e32 v1, s2 s_xor_b32 s12, s7, s0 s_sub_i32 s0, 0, s2 s_add_i32 s1, s6, s7 v_rcp_iflag_f32_e32 v1, v1 s_xor_b32 s1, s1, s7 v_mul_f32_e32 v1, 
0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 v_readfirstlane_b32 s3, v1 s_mul_i32 s0, s0, s3 s_mul_hi_u32 s0, s3, s0 s_add_i32 s3, s3, s0 s_mul_hi_u32 s0, s1, s3 s_mul_i32 s3, s0, s2 s_sub_i32 s1, s1, s3 s_add_i32 s10, s0, 1 s_sub_i32 s3, s1, s2 s_cmp_ge_u32 s1, s2 s_cselect_b32 s13, s10, s0 s_cselect_b32 s3, s3, s1 s_add_i32 s14, s13, 1 s_cmp_ge_u32 s3, s2 s_cselect_b64 s[0:1], -1, 0 s_and_b64 s[10:11], s[0:1], exec s_cselect_b32 s10, s14, s13 s_xor_b32 s10, s10, s12 s_sub_i32 s14, s10, s12 s_ashr_i32 s15, s14, 31 s_lshl_b64 s[10:11], s[14:15], 2 s_add_u32 s8, s8, s10 s_addc_u32 s9, s9, s11 s_load_dwordx2 s[24:25], s[8:9], 0x0 s_waitcnt lgkmcnt(0) s_sub_i32 s33, s25, s24 s_cmp_eq_u32 s33, 1 s_cbranch_scc1 .LBB8_23 ; %bb.1: s_load_dwordx2 s[18:19], s[4:5], 0x58 s_load_dwordx2 s[20:21], s[4:5], 0x90 s_load_dwordx2 s[12:13], s[4:5], 0xc0 s_load_dwordx2 s[16:17], s[4:5], 0xf0 s_load_dwordx2 s[22:23], s[4:5], 0x130 s_sub_i32 s2, s3, s2 s_and_b64 s[0:1], s[0:1], exec s_cselect_b32 s0, s2, s3 s_xor_b32 s0, s0, s7 s_sub_i32 s0, s0, s7 s_mul_i32 s1, s24, s31 s_add_i32 s26, s0, s1 v_cmp_gt_u32_e32 vcc, 64, v0 s_and_saveexec_b64 s[28:29], vcc s_cbranch_execz .LBB8_14 ; %bb.2: ; %.preheader156.preheader s_load_dwordx2 s[0:1], s[4:5], 0x128 s_ashr_i32 s27, s26, 31 s_lshl_b64 s[2:3], s[26:27], 2 v_cmp_gt_i32_e32 vcc, s33, v0 v_mov_b32_e32 v1, 0xff800000 s_waitcnt lgkmcnt(0) s_add_u32 s27, s0, s2 s_addc_u32 s34, s1, s3 v_mov_b32_e32 v2, 0xff800000 s_and_saveexec_b64 s[2:3], vcc s_cbranch_execz .LBB8_4 ; %bb.3: v_mul_lo_u32 v2, v0, s31 v_mov_b32_e32 v4, s34 v_ashrrev_i32_e32 v3, 31, v2 v_lshlrev_b64 v[2:3], 2, v[2:3] v_add_co_u32_e64 v2, s[0:1], s27, v2 v_addc_co_u32_e64 v3, s[0:1], v4, v3, s[0:1] global_load_dword v2, v[2:3], off .LBB8_4: ; %.preheader156.1 s_or_b64 exec, exec, s[2:3] v_add_u32_e32 v3, 64, v0 v_cmp_gt_i32_e64 s[0:1], s33, v3 s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB8_6 ; %bb.5: v_mul_lo_u32 v3, v3, s31 v_mov_b32_e32 v1, s34 v_ashrrev_i32_e32 v4, 31, v3 
; ---------------------------------------------------------------------------
; The next line opens with the remainder of %bb.5 (second guarded load).
; .LBB8_6 / %bb.7: third guarded load, k = v0 | 0x80, into v3 (default -inf).
; .LBB8_8: lane id from v_mbcnt_*; an XOR butterfly (masks 32,16,8,4,2,1) over
; ds_bpermute_b32 reduces max(v2, v1, v3) across lanes into v4 (the last
; reduction step completes on the following line).
; ---------------------------------------------------------------------------
v_lshlrev_b64 v[3:4], 2, v[3:4] v_add_co_u32_e64 v3, s[2:3], s27, v3 v_addc_co_u32_e64 v4, s[2:3], v1, v4, s[2:3] global_load_dword v1, v[3:4], off .LBB8_6: ; %.preheader156.2 s_or_b64 exec, exec, s[8:9] v_or_b32_e32 v4, 0x80, v0 v_cmp_gt_i32_e64 s[2:3], s33, v4 v_mov_b32_e32 v3, 0xff800000 s_and_saveexec_b64 s[10:11], s[2:3] s_cbranch_execz .LBB8_8 ; %bb.7: v_mul_lo_u32 v3, v4, s31 v_mov_b32_e32 v5, s34 v_ashrrev_i32_e32 v4, 31, v3 v_lshlrev_b64 v[3:4], 2, v[3:4] v_add_co_u32_e64 v3, s[8:9], s27, v3 v_addc_co_u32_e64 v4, s[8:9], v5, v4, s[8:9] global_load_dword v3, v[3:4], off .LBB8_8: ; %.preheader155 s_or_b64 exec, exec, s[10:11] v_mbcnt_lo_u32_b32 v5, -1, 0 v_mbcnt_hi_u32_b32 v5, -1, v5 v_and_b32_e32 v6, 63, v5 v_and_b32_e32 v7, 64, v5 v_add_u32_e32 v7, 64, v7 v_xor_b32_e32 v8, 32, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 s_waitcnt vmcnt(0) v_max_f32_e32 v4, 0xff800000, v2 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_max3_f32 v4, v4, v1, v3 v_lshlrev_b32_e32 v12, 2, v8 ds_bpermute_b32 v8, v12, v4 s_mov_b32 s10, 0xc2aeac50 v_mov_b32_e32 v10, 0x42800000 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 16, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v13, 2, v8 ds_bpermute_b32 v8, v13, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 8, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v14, 2, v8 ds_bpermute_b32 v8, v14, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 4, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v15, 2, v8 ds_bpermute_b32 v8, v15, v4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v8 v_xor_b32_e32 v8, 2, v6 v_cmp_lt_i32_e64 s[8:9], v8, v7 v_cndmask_b32_e64 v8, v5, v8, s[8:9] v_lshlrev_b32_e32 v16, 2, v8 ds_bpermute_b32 v8, v16, v4 v_xor_b32_e32 v6, 1, v6 v_cmp_lt_i32_e64 s[8:9], v6, v7 v_cndmask_b32_e64 v5, v5, v6, s[8:9] v_lshlrev_b32_e32 v17, 2, v5 s_waitcnt lgkmcnt(0) 
; ---------------------------------------------------------------------------
; End of the max reduction: v5 = the cross-lane maximum, replaced by 0 when it
; equals 0xff800000 (-inf).
; exp(x - v5): v_exp_f32 is fed x * 0x3fb8aa3b (~log2(e)); inputs below
; 0xc2aeac50 are first offset by 0x42800000 (64.0) and the result rescaled by
; 0x114b4ea4 - this looks like the compiler's range-extended expf expansion
; (confirm). The three exponentials are summed per lane and butterfly-added
; across lanes (same bpermute indices v12..v17) into v6.
; v6 != 0 -> .LBB8_24 sets v4 = ln(v6) + v5; otherwise v4 keeps 0x7f800000 (+inf).
; v0 == 0 -> .LBB8_25 stores v4 to global memory.
; .LBB8_26 / .LBB8_27 / .LBB8_13: lanes whose k < s33 guard held write
; exp(value - v4) to LDS at byte offset 4*v0 + 0 / 256 / 512.
; .LBB8_14 restores exec from s[28:29]; then v7 = 4 * v0, accumulators
; v1..v4 = 0 and s_barrier. s33 < 1 skips straight to .LBB8_22.
; %bb.15 : s[4:5] = byte offset of float element s26 * 512 in the buffer at
;          kernarg[0x130] (s[22:23]); s[2:3] = s31 * 512, sign-extended.
;          (s33 - 1) < 7 unsigned -> .LBB8_19 (remainder loop only).
; %bb.16 / .LBB8_17: loop unrolled by 8 over s33 & -8 entries. v5:v0 holds the
;          per-work-item address (16 bytes per work-item); each trip issues
;          eight global_load_dwordx4 spaced s[10:11] = 4 * s[2:3] bytes apart,
;          reads eight float weights from LDS (two ds_read_b128 at byte offset
;          s1) and accumulates weight * data into v[1:2] / v[3:4] with
;          v_pk_fma_f32 until s7 == s0.
; %bb.18 : re-bases the address and enters the remainder loop when s33 & 7 != 0.
; ---------------------------------------------------------------------------
v_max_f32_e32 v4, v4, v8 ds_bpermute_b32 v5, v17, v4 s_mov_b32 s8, 0xff800000 v_mov_b32_e32 v8, 0x114b4ea4 s_waitcnt lgkmcnt(0) v_max_f32_e32 v4, v4, v5 v_cmp_lg_f32_e64 s[8:9], s8, v4 v_cndmask_b32_e64 v5, 0, v4, s[8:9] v_sub_f32_e32 v4, v2, v5 v_cmp_gt_f32_e64 s[8:9], s10, v4 v_cndmask_b32_e64 v6, 0, v10, s[8:9] v_add_f32_e32 v4, v4, v6 v_sub_f32_e32 v7, v1, v5 v_sub_f32_e32 v6, v3, v5 v_cndmask_b32_e64 v18, 1.0, v8, s[8:9] v_cmp_gt_f32_e64 s[8:9], s10, v7 v_cmp_gt_f32_e64 s[10:11], s10, v6 v_cndmask_b32_e64 v11, 0, v10, s[8:9] v_cndmask_b32_e64 v10, 0, v10, s[10:11] v_pk_add_f32 v[6:7], v[6:7], v[10:11] v_mov_b32_e32 v10, 0x3fb8aa3b v_mov_b32_e32 v11, v10 v_pk_mul_f32 v[6:7], v[6:7], v[10:11] v_mul_f32_e32 v4, 0x3fb8aa3b, v4 v_exp_f32_e32 v7, v7 v_exp_f32_e32 v6, v6 v_exp_f32_e32 v4, v4 v_cndmask_b32_e64 v9, 1.0, v8, s[8:9] v_cndmask_b32_e64 v8, 1.0, v8, s[10:11] v_pk_mul_f32 v[6:7], v[8:9], v[6:7] v_fma_f32 v4, v18, v4, v7 v_add_f32_e32 v4, v6, v4 ds_bpermute_b32 v6, v12, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v13, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v14, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v15, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v16, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v4, v4, v6 ds_bpermute_b32 v6, v17, v4 s_waitcnt lgkmcnt(0) v_add_f32_e32 v6, v4, v6 v_cmp_lg_f32_e64 s[8:9], 0, v6 v_mov_b32_e32 v4, 0x7f800000 s_and_saveexec_b64 s[10:11], s[8:9] s_cbranch_execnz .LBB8_24 ; %bb.9: s_or_b64 exec, exec, s[10:11] v_cmp_eq_u32_e64 s[8:9], 0, v0 s_and_saveexec_b64 s[10:11], s[8:9] s_cbranch_execnz .LBB8_25 .LBB8_10: s_or_b64 exec, exec, s[10:11] s_and_saveexec_b64 s[4:5], vcc s_cbranch_execnz .LBB8_26 .LBB8_11: s_or_b64 exec, exec, s[4:5] s_and_saveexec_b64 s[4:5], s[0:1] s_cbranch_execnz .LBB8_27 .LBB8_12: s_or_b64 exec, exec, s[4:5] s_and_b64 exec, exec, s[2:3] s_cbranch_execz .LBB8_14 .LBB8_13: v_sub_f32_e32 v1, v3, v4 s_mov_b32 s0, 
0xc2aeac50 v_cmp_gt_f32_e32 vcc, s0, v1 v_mov_b32_e32 v3, 0x42800000 v_cndmask_b32_e32 v3, 0, v3, vcc v_add_f32_e32 v1, v1, v3 v_mul_f32_e32 v1, 0x3fb8aa3b, v1 v_mov_b32_e32 v2, 0x114b4ea4 v_exp_f32_e32 v1, v1 v_cndmask_b32_e32 v2, 1.0, v2, vcc v_mul_f32_e32 v1, v2, v1 v_lshlrev_b32_e32 v2, 2, v0 ds_write_b32 v2, v1 offset:512 .LBB8_14: ; %Flow245 s_or_b64 exec, exec, s[28:29] v_lshlrev_b32_e32 v7, 2, v0 s_mov_b32 s0, 0 s_cmp_lt_i32 s33, 1 v_mov_b32_e32 v2, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v3, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB8_22 ; %bb.15: ; %.lr.ph s_lshl_b32 s2, s26, 9 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[4:5], s[2:3], 2 s_add_u32 s7, s22, s4 s_addc_u32 s8, s23, s5 s_lshl_b32 s2, s31, 9 s_not_b32 s1, s24 s_ashr_i32 s3, s2, 31 s_add_i32 s9, s25, s1 s_cmp_lt_u32 s9, 7 s_cbranch_scc1 .LBB8_19 ; %bb.16: ; %.lr.ph.new v_lshlrev_b32_e32 v0, 4, v0 v_mov_b32_e32 v1, s23 v_add_co_u32_e32 v5, vcc, s22, v0 s_and_b32 s0, s33, -8 v_addc_co_u32_e32 v0, vcc, 0, v1, vcc s_lshl_b64 s[8:9], s[2:3], 5 s_mov_b32 s1, 0 s_mov_b32 s7, 0 v_mov_b32_e32 v3, 0 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v1, 0 v_mov_b32_e32 v2, 0 s_lshl_b64 s[10:11], s[2:3], 2 .LBB8_17: ; =>This Inner Loop Header: Depth=1 v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v11, vcc, s4, v5 v_addc_co_u32_e32 v12, vcc, v0, v6, vcc v_mov_b32_e32 v38, s11 v_add_co_u32_e32 v20, vcc, s10, v11 v_addc_co_u32_e32 v21, vcc, v12, v38, vcc v_add_co_u32_e32 v22, vcc, s10, v20 v_addc_co_u32_e32 v23, vcc, v21, v38, vcc v_add_co_u32_e32 v28, vcc, s10, v22 v_addc_co_u32_e32 v29, vcc, v23, v38, vcc v_add_co_u32_e32 v30, vcc, s10, v28 global_load_dwordx4 v[8:11], v[11:12], off v_addc_co_u32_e32 v31, vcc, v29, v38, vcc v_add_co_u32_e32 v32, vcc, s10, v30 global_load_dwordx4 v[12:15], v[20:21], off global_load_dwordx4 v[16:19], v[22:23], off v_addc_co_u32_e32 v33, vcc, v31, v38, vcc v_add_co_u32_e32 v35, vcc, s10, v32 global_load_dwordx4 v[20:23], v[28:29], off global_load_dwordx4 
v[24:27], v[30:31], off v_addc_co_u32_e32 v36, vcc, v33, v38, vcc global_load_dwordx4 v[28:31], v[32:33], off v_add_co_u32_e32 v37, vcc, s10, v35 global_load_dwordx4 v[32:35], v[35:36], off v_addc_co_u32_e32 v38, vcc, v36, v38, vcc global_load_dwordx4 v[36:39], v[37:38], off v_mov_b32_e32 v6, s1 ds_read_b128 v[40:43], v6 ds_read_b128 v[44:47], v6 offset:16 v_mov_b32_e32 v48, s9 v_add_co_u32_e32 v5, vcc, s8, v5 v_addc_co_u32_e32 v0, vcc, v0, v48, vcc s_waitcnt lgkmcnt(1) v_mov_b32_e32 v48, v40 v_mov_b32_e32 v49, v40 v_mov_b32_e32 v40, v41 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v50, v44 v_mov_b32_e32 v51, v44 v_mov_b32_e32 v44, v45 s_add_i32 s7, s7, 8 s_add_i32 s1, s1, 32 s_cmp_eq_u32 s0, s7 s_waitcnt vmcnt(7) v_pk_fma_f32 v[1:2], v[48:49], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[48:49], v[3:4] s_waitcnt vmcnt(6) v_pk_fma_f32 v[1:2], v[40:41], v[12:13], v[1:2] v_pk_fma_f32 v[3:4], v[14:15], v[40:41], v[3:4] v_mov_b32_e32 v41, v42 s_waitcnt vmcnt(5) v_pk_fma_f32 v[1:2], v[41:42], v[16:17], v[1:2] v_pk_fma_f32 v[3:4], v[18:19], v[41:42], v[3:4] v_mov_b32_e32 v42, v43 s_waitcnt vmcnt(4) v_pk_fma_f32 v[1:2], v[42:43], v[20:21], v[1:2] v_pk_fma_f32 v[3:4], v[22:23], v[42:43], v[3:4] s_waitcnt vmcnt(3) v_pk_fma_f32 v[1:2], v[50:51], v[24:25], v[1:2] v_pk_fma_f32 v[3:4], v[26:27], v[50:51], v[3:4] s_waitcnt vmcnt(2) v_pk_fma_f32 v[1:2], v[44:45], v[28:29], v[1:2] v_pk_fma_f32 v[3:4], v[30:31], v[44:45], v[3:4] v_mov_b32_e32 v45, v46 s_waitcnt vmcnt(1) v_pk_fma_f32 v[1:2], v[45:46], v[32:33], v[1:2] v_pk_fma_f32 v[3:4], v[34:35], v[45:46], v[3:4] v_mov_b32_e32 v46, v47 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[46:47], v[36:37], v[1:2] v_pk_fma_f32 v[3:4], v[38:39], v[46:47], v[3:4] s_cbranch_scc0 .LBB8_17 ; %bb.18: ; %._crit_edge.loopexit.unr-lcssa.loopexit v_mov_b32_e32 v6, s5 v_add_co_u32_e32 v5, vcc, s4, v5 v_addc_co_u32_e32 v0, vcc, v0, v6, vcc s_and_b32 s4, s33, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc0 .LBB8_20 s_branch .LBB8_22 .LBB8_19: v_lshlrev_b32_e32 v0, 2, v7 
; ---------------------------------------------------------------------------
; The next line opens with the remainder of .LBB8_19 (entry used when the
; unrolled loop is skipped: address set-up, accumulators zeroed from s[0:1]).
; .LBB8_20 / .LBB8_21: remainder loop over s33 & 7 entries (one
; global_load_dwordx4 plus one ds_read_b32 weight per trip).
; .LBB8_22: v_cvt_pk_bf16_f32 packs the four accumulators into v[0:1]. A second
; signed division, (s6 - s14 * s31) / s30, yields quotient s3 and remainder s2,
; and the 64-bit product kernarg[0x90] * s14 is started.
; ---------------------------------------------------------------------------
s_mov_b32 s1, s0 v_mov_b32_e32 v3, s8 v_add_co_u32_e32 v5, vcc, s7, v0 v_mov_b64_e32 v[1:2], s[0:1] v_addc_co_u32_e32 v0, vcc, 0, v3, vcc v_mov_b64_e32 v[3:4], s[0:1] s_and_b32 s4, s33, 7 s_cmp_eq_u32 s4, 0 s_cbranch_scc1 .LBB8_22 .LBB8_20: ; %.epil.preheader.preheader v_add_co_u32_e32 v5, vcc, 8, v5 s_lshl_b32 s5, s0, 2 v_addc_co_u32_e32 v6, vcc, 0, v0, vcc s_lshl_b64 s[0:1], s[2:3], 2 .LBB8_21: ; %.epil.preheader ; =>This Inner Loop Header: Depth=1 global_load_dwordx4 v[8:11], v[5:6], off offset:-8 v_mov_b32_e32 v0, s5 ds_read_b32 v12, v0 v_mov_b32_e32 v13, s1 v_add_co_u32_e32 v5, vcc, s0, v5 s_add_i32 s5, s5, 4 s_add_i32 s4, s4, -1 v_addc_co_u32_e32 v6, vcc, v6, v13, vcc s_waitcnt lgkmcnt(0) v_mov_b32_e32 v13, v12 s_cmp_lg_u32 s4, 0 s_waitcnt vmcnt(0) v_pk_fma_f32 v[1:2], v[12:13], v[8:9], v[1:2] v_pk_fma_f32 v[3:4], v[10:11], v[12:13], v[3:4] s_cbranch_scc1 .LBB8_21 .LBB8_22: ; %._crit_edge v_cvt_pk_bf16_f32 v0, v1, v2 s_ashr_i32 s0, s30, 31 s_add_i32 s1, s30, s0 s_xor_b32 s1, s1, s0 v_and_b32_e32 v2, 0xffff0000, v0 v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD v_cvt_f32_u32_e32 v2, s1 s_mul_i32 s2, s14, s31 s_sub_i32 s2, s6, s2 s_ashr_i32 s3, s2, 31 v_rcp_iflag_f32_e32 v2, v2 s_add_i32 s4, s2, s3 s_xor_b32 s0, s3, s0 s_xor_b32 s3, s4, s3 v_mul_f32_e32 v2, 0x4f7ffffe, v2 v_cvt_u32_f32_e32 v2, v2 s_sub_i32 s4, 0, s1 v_cvt_pk_bf16_f32 v1, v3, v4 v_or3_b32 v0, 0, v0, 0 v_readfirstlane_b32 s5, v2 s_mul_i32 s4, s4, s5 s_mul_hi_u32 s4, s5, s4 s_add_i32 s5, s5, s4 s_mul_hi_u32 s4, s3, s5 s_mul_i32 s5, s4, s1 s_sub_i32 s3, s3, s5 s_add_i32 s5, s4, 1 s_sub_i32 s6, s3, s1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s4, s5, s4 s_cselect_b32 s3, s6, s3 s_add_i32 s5, s4, 1 s_cmp_ge_u32 s3, s1 s_cselect_b32 s1, s5, s4 s_xor_b32 s1, s1, s0 s_sub_i32 s3, s1, s0 s_mul_i32 s0, s3, s30 s_sub_i32 s2, s2, s0 s_mul_i32 s0, s20, s15 s_mul_hi_u32 s1, s20, s14 s_add_i32 s0, s1, s0 s_mul_i32 s1, s21, s14 s_add_i32 s1, s0, s1 s_mul_i32 s0, s20, s14 
; ---------------------------------------------------------------------------
; .LBB8_22 (cont.): store address = kernarg[0x58] + 2 * (kernarg[0x90] * s14 +
; kernarg[0xf0] * s3 + kernarg[0xc0] * s2), with per-work-item offset 2 * v7;
; global_store_dwordx2 writes 8 bytes per work-item. .LBB8_23: s_endpgm.
; .LBB8_24: ln(v6) via v_log_f32 * 0x3f317218 (~ln 2), with a 0x4f800000
; prescale (and matching 0xc1b17218 correction) for inputs matched by class
; mask 0x90, then + v5.
; .LBB8_25: loads a pointer from kernarg[0x60] (overwriting s[4:5]) and stores
; v4 at byte offset 4 * s[6:7]. .LBB8_26 / .LBB8_27: the LDS weight writes for
; offsets 0 and 256.
; Kernel descriptor (after `.section .rodata`): 640 bytes of LDS
; (.amdhsa_group_segment_fixed_size), 336 bytes of kernarg, 6 user SGPRs
; (private segment buffer + kernarg pointer), workgroup id X as the only
; system SGPR, .amdhsa_next_free_vgpr 52 / .amdhsa_next_free_sgpr 35.
; NOTE(review): 640 = 160 * 4; the LDS writes are guarded only by k < s33, so
; they stay inside the allocation only while s33 <= 160 - presumably enforced
; by the launcher; confirm.
; ---------------------------------------------------------------------------
s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s4, s18, s0 s_addc_u32 s5, s19, s1 s_ashr_i32 s0, s3, 31 s_mul_i32 s0, s16, s0 s_mul_hi_u32 s1, s16, s3 s_add_i32 s0, s1, s0 s_mul_i32 s1, s17, s3 s_add_i32 s1, s0, s1 s_mul_i32 s0, s16, s3 s_lshl_b64 s[0:1], s[0:1], 1 s_add_u32 s3, s4, s0 s_addc_u32 s4, s5, s1 s_ashr_i32 s0, s2, 31 s_mul_i32 s0, s12, s0 s_mul_hi_u32 s1, s12, s2 s_add_i32 s0, s1, s0 s_mul_i32 s1, s13, s2 s_add_i32 s1, s0, s1 s_mul_i32 s0, s12, s2 s_lshl_b64 s[0:1], s[0:1], 1 v_and_b32_e32 v3, 0xffff, v1 v_and_b32_e32 v1, 0xffff0000, v1 s_add_u32 s0, s3, s0 v_or3_b32 v1, v3, 0, v1 s_addc_u32 s1, s4, s1 v_lshlrev_b32_e32 v2, 1, v7 global_store_dwordx2 v2, v[0:1], s[0:1] .LBB8_23: s_endpgm .LBB8_24: s_movk_i32 s8, 0x90 v_mov_b32_e32 v4, 0x4f800000 v_cmp_class_f32_e64 s[8:9], v6, s8 v_cndmask_b32_e64 v4, 1.0, v4, s[8:9] v_mul_f32_e32 v4, v6, v4 v_mov_b32_e32 v6, 0xc1b17218 v_log_f32_e32 v4, v4 v_cndmask_b32_e64 v6, 0, v6, s[8:9] v_fmac_f32_e32 v6, 0x3f317218, v4 v_add_f32_e32 v4, v6, v5 s_or_b64 exec, exec, s[10:11] v_cmp_eq_u32_e64 s[8:9], 0, v0 s_and_saveexec_b64 s[10:11], s[8:9] s_cbranch_execz .LBB8_10 .LBB8_25: s_load_dwordx2 s[4:5], s[4:5], 0x60 s_lshl_b64 s[8:9], s[6:7], 2 v_mov_b32_e32 v5, 0 s_waitcnt lgkmcnt(0) s_add_u32 s4, s4, s8 s_addc_u32 s5, s5, s9 global_store_dword v5, v4, s[4:5] s_or_b64 exec, exec, s[10:11] s_and_saveexec_b64 s[4:5], vcc s_cbranch_execz .LBB8_11 .LBB8_26: v_sub_f32_e32 v2, v2, v4 s_mov_b32 s7, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s7, v2 v_mov_b32_e32 v6, 0x42800000 v_cndmask_b32_e32 v6, 0, v6, vcc v_add_f32_e32 v2, v2, v6 v_mul_f32_e32 v2, 0x3fb8aa3b, v2 v_mov_b32_e32 v5, 0x114b4ea4 v_exp_f32_e32 v2, v2 v_cndmask_b32_e32 v5, 1.0, v5, vcc v_mul_f32_e32 v2, v5, v2 v_lshlrev_b32_e32 v5, 2, v0 ds_write_b32 v5, v2 s_or_b64 exec, exec, s[4:5] s_and_saveexec_b64 s[4:5], s[0:1] s_cbranch_execz .LBB8_12 .LBB8_27: v_sub_f32_e32 v1, v1, v4 s_mov_b32 s0, 0xc2aeac50 v_cmp_gt_f32_e32 vcc, s0, v1 v_mov_b32_e32 v5, 0x42800000 v_cndmask_b32_e32 v5, 
0, v5, vcc v_add_f32_e32 v1, v1, v5 v_mul_f32_e32 v1, 0x3fb8aa3b, v1 v_mov_b32_e32 v2, 0x114b4ea4 v_exp_f32_e32 v1, v1 v_cndmask_b32_e32 v2, 1.0, v2, vcc v_mul_f32_e32 v1, v2, v1 v_lshlrev_b32_e32 v2, 2, v0 ds_write_b32 v2, v1 offset:256 s_or_b64 exec, exec, s[4:5] s_and_b64 exec, exec, s[2:3] s_cbranch_execnz .LBB8_13 s_branch .LBB8_14 .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 640 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 52 .amdhsa_next_free_sgpr 35 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section 
.text._ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end8: .size _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params, .Lfunc_end8-_ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 2828 ; NumSgprs: 39 ; NumVgprs: 52 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 640 bytes/workgroup (compile time only) ; SGPRBlocks: 4 ; VGPRBlocks: 12 ; NumSGPRsForWavesPerEU: 39 ; NumVGPRsForWavesPerEU: 52 ; Occupancy: 4 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params .globl 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params,@function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x110 s_load_dwordx2 s[34:35], s[4:5], 0x0 s_lshl_b32 s2, s8, 3 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[2:3], s[2:3], 2 s_waitcnt lgkmcnt(0) s_add_u32 s0, s0, s2 s_addc_u32 s1, s1, s3 s_load_dwordx4 s[16:19], s[0:1], 0x0 s_waitcnt lgkmcnt(0) s_cmp_ge_i32 s16, s34 s_cbranch_scc1 .LBB9_42 ; %bb.1: s_cmp_gt_i32 s16, s18 s_cbranch_scc1 .LBB9_42 ; %bb.2: ; %.lr.ph s_load_dwordx4 s[20:23], s[4:5], 0x28 s_load_dwordx2 s[2:3], s[4:5], 0x48 s_load_dwordx2 s[52:53], s[4:5], 0x68 s_load_dwordx4 s[24:27], s[4:5], 0x58 s_load_dwordx2 s[54:55], s[4:5], 0x80 s_load_dwordx4 s[28:31], s[4:5], 0x90 s_load_dwordx2 s[56:57], s[4:5], 0x130 s_load_dwordx4 s[12:15], s[4:5], 0x140 s_load_dword s33, s[4:5], 0xb0 s_load_dwordx4 s[8:11], s[4:5], 0xc0 s_load_dwordx4 s[36:39], s[4:5], 0xc s_load_dwordx2 s[40:41], s[4:5], 0x20 s_waitcnt lgkmcnt(0) s_load_dword s42, s[12:13], 0x0 s_load_dword s34, s[14:15], 0x0 s_ashr_i32 s12, s17, 31 s_lshr_b32 s12, s12, 26 s_lshl_b32 s6, s6, 5 s_add_i32 s12, s17, s12 s_ashr_i32 s44, s6, 31 s_ashr_i32 s45, s7, 31 
s_ashr_i32 s17, s12, 6 s_mul_i32 s12, s30, s44 s_mul_hi_u32 s13, s30, s6 s_mul_hi_u32 s14, s10, s7 s_mul_i32 s15, s10, s45 s_add_i32 s12, s13, s12 s_mul_i32 s13, s31, s6 s_add_i32 s14, s14, s15 s_mul_i32 s11, s11, s7 s_add_i32 s12, s12, s13 s_mul_i32 s13, s30, s6 s_add_i32 s14, s14, s11 s_mul_i32 s10, s10, s7 s_add_u32 s31, s10, s13 s_addc_u32 s68, s14, s12 s_ashr_i32 s43, s38, 31 s_add_i32 s10, s38, s43 s_xor_b32 s38, s10, s43 v_cvt_f32_u32_e32 v1, s38 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v2, s42 v_mul_f32_e32 v2, s34, v2 v_mul_f32_e32 v168, s41, v2 v_rcp_iflag_f32_e32 v1, v1 s_sub_i32 s41, 0, s38 v_mul_f32_e32 v167, s40, v2 s_add_i32 s40, s7, s45 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 s_xor_b32 s40, s40, s45 s_xor_b32 s46, s45, s43 v_lshlrev_b32_e32 v5, 4, v0 v_readfirstlane_b32 s42, v1 s_mul_i32 s41, s41, s42 s_mul_hi_u32 s41, s42, s41 s_add_i32 s42, s42, s41 s_mul_hi_u32 s41, s40, s42 s_mul_i32 s42, s41, s38 s_sub_i32 s40, s40, s42 s_add_i32 s42, s41, 1 s_sub_i32 s43, s40, s38 s_cmp_ge_u32 s40, s38 s_cselect_b32 s47, s42, s41 s_cselect_b32 s40, s43, s40 s_add_i32 s48, s47, 1 s_cmp_ge_u32 s40, s38 v_lshlrev_b32_e32 v9, 1, v0 s_load_dwordx2 s[10:11], s[4:5], 0xe0 s_load_dwordx4 s[12:15], s[4:5], 0xf0 s_load_dwordx2 s[58:59], s[4:5], 0x100 s_load_dwordx4 s[40:43], s[4:5], 0x120 s_load_dword s38, s[0:1], 0x10 s_cselect_b32 s0, s48, s47 v_and_b32_e32 v8, 0x3f0, v5 v_and_b32_e32 v9, 0x70, v9 s_xor_b32 s0, s0, s46 v_lshrrev_b32_e32 v2, 7, v0 v_xor_b32_e32 v8, v8, v9 s_sub_i32 s0, s0, s46 v_and_b32_e32 v1, 63, v0 v_lshl_or_b32 v8, v2, 12, v8 s_ashr_i32 s1, s0, 31 v_add_u32_e32 v171, 0, v8 v_lshrrev_b32_e32 v8, 3, v0 v_cmp_lt_u32_e32 vcc, 31, v1 s_waitcnt lgkmcnt(0) s_mul_i32 s1, s10, s1 s_mul_hi_u32 s4, s10, s0 v_lshl_add_u32 v4, v1, 4, 0 v_xor_b32_e32 v8, v8, v0 v_bfe_u32 v9, v0, 2, 4 v_cndmask_b32_e64 v1, 0, 1, vcc s_add_i32 s1, s4, s1 s_mul_i32 s4, s11, s0 v_xor_b32_e32 v172, v9, v1 v_lshlrev_b32_e32 v1, 4, v8 s_add_i32 s1, s1, s4 s_mul_i32 s0, s10, 
s0 v_and_b32_e32 v173, 48, v1 v_lshrrev_b32_e32 v1, 2, v0 s_add_u32 s69, s2, s0 v_and_b32_e32 v169, 15, v0 v_and_b32_e32 v170, 48, v0 v_lshlrev_b32_e32 v3, 6, v0 s_movk_i32 s0, 0x3c0 v_lshlrev_b32_e32 v7, 3, v0 v_and_b32_e32 v8, 12, v1 v_and_or_b32 v3, v3, s0, v170 v_and_b32_e32 v7, 0x70, v7 v_lshl_or_b32 v182, v2, 4, v8 v_and_or_b32 v8, v1, 16, v169 v_xor_b32_e32 v3, v3, v7 v_lshlrev_b32_e32 v1, 3, v8 s_addc_u32 s70, s3, s1 v_lshl_or_b32 v7, v2, 10, v3 v_lshrrev_b32_e32 v3, 5, v0 v_lshl_or_b32 v10, v2, 2, v1 s_add_i32 s2, 0, 0xaa00 v_and_b32_e32 v6, 0x400, v5 v_or_b32_e32 v3, v5, v3 v_add_u32_e32 v183, s2, v10 v_add_u32_e32 v184, s2, v1 s_movk_i32 s2, 0x3f4 v_and_or_b32 v12, v3, s2, v6 s_add_i32 s2, 0, 0xa800 v_add_u32_e32 v194, s2, v1 v_bfe_u32 v1, v0, 4, 2 v_add_u32_e32 v193, s2, v10 v_lshl_or_b32 v10, v2, 6, v1 v_lshrrev_b32_e32 v1, 1, v0 v_and_or_b32 v163, v1, 64, v170 v_mov_b32_e32 v164, 0 v_mad_u64_u32 v[1:2], s[2:3], s8, v8, v[163:164] s_sub_i32 s71, s35, s6 v_and_b32_e32 v11, 0xb0, v0 v_mad_u64_u32 v[2:3], s[2:3], s9, v8, v[2:3] v_cmp_eq_u32_e64 s[0:1], 0, v11 v_lshlrev_b32_e32 v11, 2, v8 v_cmp_eq_u32_e32 vcc, 0, v10 v_cmp_gt_i32_e64 s[2:3], s71, v8 s_mul_i32 s4, s8, s44 s_mul_hi_u32 s5, s8, s6 s_and_b64 s[60:61], vcc, s[2:3] v_mov_b32_e32 v3, s43 v_add_co_u32_e32 v195, vcc, s42, v11 s_mul_i32 s10, s9, s6 s_add_i32 s4, s5, s4 s_mul_hi_u32 s5, s12, s7 s_mul_i32 s9, s12, s45 v_addc_co_u32_e32 v196, vcc, 0, v3, vcc s_add_i32 s5, s5, s9 s_mul_i32 s9, s13, s7 v_mov_b32_e32 v3, s27 v_add_co_u32_e32 v198, vcc, s26, v11 s_mul_i32 s8, s8, s6 s_add_i32 s4, s4, s10 s_add_i32 s5, s5, s9 s_mul_i32 s9, s12, s7 v_addc_co_u32_e32 v199, vcc, 0, v3, vcc v_lshlrev_b32_e32 v3, 9, v8 v_or_b32_e32 v9, s6, v8 s_add_u32 s72, s9, s8 v_add_u32_e32 v197, v4, v6 v_lshrrev_b32_e32 v201, 6, v0 v_and_b32_e32 v0, 0x7f0, v5 v_or_b32_e32 v4, v3, v163 v_or_b32_e32 v5, 0x80, v163 v_or_b32_e32 v6, 0x100, v163 v_add_u32_e32 v207, 0, v7 v_or_b32_e32 v7, 0x180, v163 v_add_u32_e32 v8, 0, v12 
v_lshlrev_b64 v[165:166], 1, v[1:2] v_mbcnt_lo_u32_b32 v226, -1, 0 v_or_b32_e32 v174, 64, v173 v_or_b32_e32 v175, 0x80, v173 v_or_b32_e32 v176, 0xc0, v173 v_or_b32_e32 v177, 0x100, v173 v_or_b32_e32 v178, 0x140, v173 v_or_b32_e32 v179, 0x180, v173 v_or_b32_e32 v180, 0x1c0, v173 v_or_b32_e32 v181, 0x200, v173 v_sub_u32_e32 v185, v184, v11 v_or_b32_e32 v186, 1, v182 v_or_b32_e32 v187, 2, v182 v_or_b32_e32 v188, 3, v182 v_or_b32_e32 v189, 32, v182 v_or_b32_e32 v190, 33, v182 v_or_b32_e32 v191, 34, v182 v_or_b32_e32 v192, 35, v182 s_addc_u32 s73, s5, s4 v_add_u32_e32 v200, 0x400, v171 s_mov_b32 s44, 0 v_add_u32_e32 v202, 0x800, v171 v_add_u32_e32 v203, 0xc00, v171 v_cmp_eq_u32_e64 s[4:5], 0, v170 v_xad_u32 v204, v9, -1, s35 v_sub_u32_e32 v205, v194, v11 v_add_u32_e32 v206, 0x2000, v171 s_mov_b32 s51, 0x20000 s_brev_b32 s50, 1 s_mov_b32 s74, 0xff800000 v_add_u32_e32 v208, 0, v0 v_lshlrev_b32_e32 v209, 2, v4 v_add_lshl_u32 v210, v3, v5, 2 v_add_lshl_u32 v211, v3, v6, 2 v_add_lshl_u32 v212, v3, v7, 2 v_add_lshl_u32 v213, v3, v163, 2 v_add_u32_e32 v214, 0xa000, v8 v_add_u32_e32 v215, 0x2400, v171 v_add_u32_e32 v216, 0x2800, v171 s_mov_b32 s26, s16 v_add_u32_e32 v217, 0x2c00, v171 v_add_u32_e32 v218, 0x4000, v171 v_add_u32_e32 v219, 0x4400, v171 v_add_u32_e32 v220, 0x4800, v171 v_add_u32_e32 v221, 0x4c00, v171 v_add_u32_e32 v222, 0x6000, v171 v_add_u32_e32 v223, 0x6400, v171 v_add_u32_e32 v224, 0x6800, v171 v_add_u32_e32 v225, 0x6c00, v171 v_mbcnt_hi_u32_b32 v227, -1, v226 s_branch .LBB9_5 .LBB9_3: ; %Flow911 ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[8:9] .LBB9_4: ; %_ZN5flash49compute_attn_1rowblock_splitkv_mla_fp8_gfx938_TP4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEvRK20Flash_fwd_mla_paramsiiiiiiibRT1_fff.exit ; in Loop: Header=BB9_5 Depth=1 s_add_i32 s8, s26, 1 s_cmp_lt_i32 s26, s18 s_mov_b32 s26, s8 s_cbranch_scc0 .LBB9_42 .LBB9_5: ; =>This Loop 
Header: Depth=1 ; Child Loop BB9_9 Depth 2 s_ashr_i32 s27, s26, 31 s_lshl_b64 s[62:63], s[26:27], 2 s_add_u32 s8, s20, s62 s_addc_u32 s9, s21, s63 global_load_dword v0, v164, s[8:9] s_cmp_le_i32 s26, s16 s_waitcnt vmcnt(0) v_readfirstlane_b32 s76, v0 s_cbranch_scc1 .LBB9_7 ; %bb.6: ; in Loop: Header=BB9_5 Depth=1 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) .LBB9_7: ; in Loop: Header=BB9_5 Depth=1 s_cmp_eq_u32 s26, s16 s_cselect_b64 s[42:43], -1, 0 s_and_b64 s[8:9], s[42:43], exec s_cselect_b32 s75, s17, 0 s_cmp_eq_u32 s26, s18 s_cselect_b32 s8, s19, s76 s_add_i32 s8, s8, 63 s_ashr_i32 s9, s8, 31 s_lshr_b32 s9, s9, 26 s_add_i32 s8, s8, s9 s_ashr_i32 s64, s8, 6 s_mul_i32 s8, s26, s53 s_mul_hi_u32 s9, s26, s52 s_add_i32 s8, s9, s8 s_mul_i32 s9, s27, s52 s_add_i32 s76, s76, 63 s_add_i32 s8, s8, s9 s_mul_i32 s9, s26, s52 s_add_u32 s9, s31, s9 s_addc_u32 s8, s68, s8 s_add_u32 s48, s22, s9 s_addc_u32 s49, s23, s8 v_readfirstlane_b32 s8, v201 s_lshr_b32 s9, s8, 31 s_add_i32 s9, s8, s9 s_ashr_i32 s10, s9, 1 s_and_b32 s9, s9, -2 s_sub_i32 s11, s8, s9 v_lshl_or_b32 v3, s11, 4, v169 v_lshl_or_b32 v0, s10, 6, v170 v_mad_u64_u32 v[1:2], s[8:9], v3, s30, v[0:1] s_lshl_b32 s8, s11, 10 s_lshl_b32 s9, s10, 11 v_cmp_gt_i32_e32 vcc, s71, v3 s_add_i32 s8, s8, s9 v_cndmask_b32_e32 v2, -1, v1, vcc s_add_i32 s10, s8, 0 ;;#ASMSTART s_mov_b32 m0, s10 buffer_load_dwordx4 v2, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v2, 0x80, v1 v_cndmask_b32_e32 v2, -1, v2, vcc s_add_i32 s8, s10, 0x1000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v2, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v2, 0x100, v1 v_cndmask_b32_e32 v2, -1, v2, vcc s_add_i32 s8, s10, 0x2000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v2, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v2, 0x180, v1 v_cndmask_b32_e32 v2, -1, v2, vcc s_add_i32 s8, s10, 0x3000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v2, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_cmp_gt_i32_e64 
s[8:9], 64, v0 v_add_u32_e32 v1, 0x200, v1 s_and_b64 vcc, s[8:9], vcc v_cndmask_b32_e32 v0, -1, v1, vcc s_addk_i32 s10, 0x4000 ;;#ASMSTART s_mov_b32 m0, s10 buffer_load_dwordx4 v0, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_b128 v[64:67], v197 ds_read_b128 v[68:71], v197 offset:2048 ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_b128 v[72:75], v197 offset:4096 ds_read_b128 v[76:79], v197 offset:6144 ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_b128 v[80:83], v197 offset:8192 ds_read_b128 v[84:87], v197 offset:10240 ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_b128 v[88:91], v197 offset:12288 ds_read_b128 v[92:95], v197 offset:14336 ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND ds_read_b128 v[96:99], v197 offset:16384 v_mov_b32_e32 v63, 0 s_cmp_le_i32 s64, s75 v_mov_b32_e32 v62, v63 v_mov_b32_e32 v61, v63 v_mov_b32_e32 v60, v63 v_mov_b32_e32 v59, v63 v_mov_b32_e32 v58, v63 v_mov_b32_e32 v57, v63 v_mov_b32_e32 v56, v63 v_mov_b32_e32 v55, v63 v_mov_b32_e32 v54, v63 v_mov_b32_e32 v53, v63 v_mov_b32_e32 v52, v63 v_mov_b32_e32 v51, v63 v_mov_b32_e32 v50, v63 v_mov_b32_e32 v49, v63 v_mov_b32_e32 v48, v63 v_mov_b32_e32 v47, v63 v_mov_b32_e32 v46, v63 v_mov_b32_e32 v45, v63 v_mov_b32_e32 v44, v63 v_mov_b32_e32 v43, v63 v_mov_b32_e32 v42, v63 v_mov_b32_e32 v41, v63 v_mov_b32_e32 v40, v63 v_mov_b32_e32 v39, v63 v_mov_b32_e32 v38, v63 v_mov_b32_e32 v37, v63 v_mov_b32_e32 v36, v63 v_mov_b32_e32 v35, v63 v_mov_b32_e32 v34, v63 v_mov_b32_e32 v33, v63 v_mov_b32_e32 v32, v63 v_mov_b32_e32 v31, v63 v_mov_b32_e32 v30, v63 v_mov_b32_e32 v29, v63 v_mov_b32_e32 v28, v63 v_mov_b32_e32 v27, v63 v_mov_b32_e32 v26, v63 v_mov_b32_e32 v25, v63 v_mov_b32_e32 v24, v63 v_mov_b32_e32 v23, v63 v_mov_b32_e32 v22, v63 v_mov_b32_e32 v21, v63 v_mov_b32_e32 v20, v63 v_mov_b32_e32 v19, v63 v_mov_b32_e32 v18, v63 v_mov_b32_e32 v17, v63 v_mov_b32_e32 v16, v63 v_mov_b32_e32 v15, v63 v_mov_b32_e32 v14, 
v63 v_mov_b32_e32 v13, v63 v_mov_b32_e32 v12, v63 v_mov_b32_e32 v11, v63 v_mov_b32_e32 v10, v63 v_mov_b32_e32 v9, v63 v_mov_b32_e32 v8, v63 v_mov_b32_e32 v7, v63 v_mov_b32_e32 v6, v63 v_mov_b32_e32 v5, v63 v_mov_b32_e32 v4, v63 v_mov_b32_e32 v3, v63 v_mov_b32_e32 v2, v63 v_mov_b32_e32 v1, v63 v_mov_b32_e32 v0, v63 v_mov_b32_e32 v228, v63 v_mov_b32_e32 v229, v63 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB9_23 ; %bb.8: ; %.lr.ph.i.preheader ; in Loop: Header=BB9_5 Depth=1 s_mul_i32 s8, s26, s59 s_mul_hi_u32 s9, s26, s58 s_add_i32 s8, s9, s8 s_mul_i32 s9, s27, s58 s_add_i32 s9, s8, s9 s_mul_i32 s8, s26, s58 s_lshl_b64 s[8:9], s[8:9], 2 s_add_u32 s10, s14, s8 s_addc_u32 s11, s15, s9 s_lshl_b32 s8, s64, 6 s_ashr_i32 s65, s64, 31 s_sub_i32 s77, s76, s8 s_lshl_b64 s[8:9], s[64:65], 2 s_add_u32 s8, s10, s8 s_addc_u32 s9, s11, s9 s_add_u32 s65, s8, -4 v_mov_b32_e32 v228, 0 s_addc_u32 s78, s9, -1 s_mov_b64 s[66:67], 0 s_mov_b32 s79, s64 v_mov_b32_e32 v229, 0 v_mov_b32_e32 v0, 0 v_mov_b32_e32 v1, v228 v_mov_b32_e32 v2, v228 v_mov_b32_e32 v3, v228 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v5, v228 v_mov_b32_e32 v6, v228 v_mov_b32_e32 v7, v228 v_mov_b32_e32 v8, 0 v_mov_b32_e32 v9, v228 v_mov_b32_e32 v10, v228 v_mov_b32_e32 v11, v228 v_mov_b32_e32 v12, 0 v_mov_b32_e32 v13, v228 v_mov_b32_e32 v14, v228 v_mov_b32_e32 v15, v228 v_mov_b32_e32 v16, 0 v_mov_b32_e32 v17, v228 v_mov_b32_e32 v18, v228 v_mov_b32_e32 v19, v228 v_mov_b32_e32 v20, 0 v_mov_b32_e32 v21, v228 v_mov_b32_e32 v22, v228 v_mov_b32_e32 v23, v228 v_mov_b32_e32 v24, 0 v_mov_b32_e32 v25, v228 v_mov_b32_e32 v26, v228 v_mov_b32_e32 v27, v228 v_mov_b32_e32 v28, 0 v_mov_b32_e32 v29, v228 v_mov_b32_e32 v30, v228 v_mov_b32_e32 v31, v228 v_mov_b32_e32 v32, 0 v_mov_b32_e32 v33, v228 v_mov_b32_e32 v34, v228 v_mov_b32_e32 v35, v228 v_mov_b32_e32 v36, 0 v_mov_b32_e32 v37, v228 v_mov_b32_e32 v38, v228 v_mov_b32_e32 v39, v228 v_mov_b32_e32 v40, 0 v_mov_b32_e32 v41, v228 v_mov_b32_e32 v42, v228 v_mov_b32_e32 v43, 
v228 v_mov_b32_e32 v44, 0 v_mov_b32_e32 v45, v228 v_mov_b32_e32 v46, v228 v_mov_b32_e32 v47, v228 v_mov_b32_e32 v48, 0 v_mov_b32_e32 v49, v228 v_mov_b32_e32 v50, v228 v_mov_b32_e32 v51, v228 v_mov_b32_e32 v52, 0 v_mov_b32_e32 v53, v228 v_mov_b32_e32 v54, v228 v_mov_b32_e32 v55, v228 v_mov_b32_e32 v56, 0 v_mov_b32_e32 v57, v228 v_mov_b32_e32 v58, v228 v_mov_b32_e32 v59, v228 v_mov_b32_e32 v60, 0 v_mov_b32_e32 v61, v228 v_mov_b32_e32 v62, v228 v_mov_b32_e32 v63, v228 ; implicit-def: $vgpr231 ; implicit-def: $vgpr230 .LBB9_9: ; %.lr.ph.i ; Parent Loop BB9_5 Depth=1 ; => This Inner Loop Header: Depth=2 s_add_u32 s8, s65, s66 s_addc_u32 s9, s78, s67 ;;#ASMSTART s_load_dword s10, s[8:9], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s8, s10, 31 s_mul_i32 s9, s10, s55 s_mul_hi_u32 s11, s10, s54 s_add_i32 s9, s11, s9 s_mul_i32 s8, s8, s54 s_add_i32 s9, s9, s8 v_readfirstlane_b32 s8, v201 v_lshl_or_b32 v100, s8, 4, v172 v_mul_lo_u32 v101, v100, s33 s_mul_i32 s10, s10, s54 s_add_u32 s48, s69, s10 s_addc_u32 s49, s70, s9 s_add_i32 s9, s77, 1 v_add_u32_e32 v102, v101, v173 v_cmp_gt_i32_e32 vcc, s9, v100 s_lshl_b32 s8, s8, 10 v_cndmask_b32_e32 v100, -1, v102, vcc s_add_i32 s8, s8, 0 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v174 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s9, s8, 0x1000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v175 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s9, s8, 0x2000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v176 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s9, s8, 0x3000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v177 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s9, s8, 0x4000 ;;#ASMSTART s_mov_b32 m0, s9 
buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v178 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s9, s8, 0x5000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v179 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s9, s8, 0x6000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v180 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s9, s8, 0x7000 ;;#ASMSTART s_mov_b32 m0, s9 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v181 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s8, s8, 0x8000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v100, s[48:51], s44 ,offen offset:0, lds ;;#ASMEND ;;#ASMSTART s_waitcnt vmcnt(8) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 ds_read_b128 v[104:107], v207 offset:2048 s_mov_b32 s46, s44 s_mov_b32 s47, s44 s_mov_b32 s45, s44 v_mov_b64_e32 v[110:111], s[46:47] v_mov_b64_e32 v[108:109], s[44:45] v_mov_b64_e32 v[114:115], v[110:111] v_mov_b64_e32 v[112:113], v[108:109] s_waitcnt lgkmcnt(1) s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[64:65], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[64:65], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[66:67], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[66:67], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(7) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:4096 ds_read_b128 v[104:107], v207 offset:6144 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[68:69], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[68:69], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[70:71], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[70:71], v[106:107], 
v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(6) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:8192 ds_read_b128 v[104:107], v207 offset:10240 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[72:73], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[72:73], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[74:75], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[74:75], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(5) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:12288 ds_read_b128 v[104:107], v207 offset:14336 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[76:77], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[76:77], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[78:79], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[78:79], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:16384 ds_read_b128 v[104:107], v207 offset:18432 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[80:81], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[80:81], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[82:83], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[82:83], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:20480 ds_read_b128 v[104:107], v207 offset:22528 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[84:85], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[84:85], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[86:87], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[86:87], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(2) 
s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:24576 ds_read_b128 v[104:107], v207 offset:26624 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[88:89], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[88:89], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[90:91], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[90:91], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:28672 ds_read_b128 v[104:107], v207 offset:30720 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[92:93], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[92:93], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[94:95], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[94:95], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND s_ashr_i32 s8, s39, 31 s_add_i32 s9, s8, s39 s_xor_b32 s9, s9, s8 v_cvt_f32_u32_e32 v100, s9 v_ashrrev_i32_e32 v118, 31, v204 v_add_u32_e32 v120, v118, v204 v_xor_b32_e32 v120, v120, v118 v_rcp_iflag_f32_e32 v100, v100 v_mov_b32_e32 v119, 0xff800000 s_cmp_lg_u32 s66, 0 v_mul_f32_e32 v100, 0x4f7ffffe, v100 v_cvt_u32_f32_e32 v116, v100 ds_read_b128 v[100:103], v207 offset:32768 ds_read_b128 v[104:107], v207 offset:34816 v_mul_lo_u32 v117, s9, v116 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[96:97], v[100:101], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[98:99], v[102:103], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[96:97], v[104:105], v[108:111] lit v_sub_u32_e32 v117, 0, v117 v_mul_hi_u32 v117, v117, v116 v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[98:99], v[106:107], v[108:111] lit v_add_u32_e32 v116, v117, v116 v_mul_hi_u32 v116, v116, v120 v_xor_b32_e32 v117, s8, v118 v_mul_lo_u32 v100, v116, s9 v_add_u32_e32 v101, 1, v116 
v_sub_u32_e32 v100, v120, v100 v_cmp_le_u32_e32 vcc, s9, v100 v_subrev_u32_e32 v102, s9, v100 v_cndmask_b32_e32 v101, v116, v101, vcc v_cndmask_b32_e32 v100, v100, v102, vcc v_add_u32_e32 v102, 1, v101 v_cmp_le_u32_e32 vcc, s9, v100 v_cndmask_b32_e32 v100, v101, v102, vcc v_xor_b32_e32 v100, v100, v117 v_sub_u32_e32 v100, v117, v100 v_add_u32_e32 v104, s77, v100 v_cmp_le_i32_e32 vcc, v182, v104 v_mov_b64_e32 v[100:101], v[108:109] v_cndmask_b32_e32 v241, v119, v112, vcc v_mov_b64_e32 v[102:103], v[110:111] v_mov_b32_e32 v100, s74 v_cmp_gt_i32_e32 vcc, v189, v104 v_cndmask_b32_e32 v236, v108, v100, vcc v_cndmask_b32_e32 v100, v111, v103, vcc v_cndmask_b32_e32 v102, v110, v102, vcc v_cndmask_b32_e32 v101, v109, v101, vcc v_cmp_le_i32_e32 vcc, v190, v104 v_cndmask_b32_e32 v238, v119, v101, vcc v_cmp_le_i32_e32 vcc, v191, v104 v_cmp_le_i32_e64 s[8:9], v186, v104 v_cmp_le_i32_e64 s[10:11], v187, v104 v_cmp_le_i32_e64 s[12:13], v188, v104 v_cndmask_b32_e32 v235, v119, v102, vcc v_cmp_le_i32_e32 vcc, v192, v104 v_cndmask_b32_e64 v240, v119, v113, s[8:9] v_cndmask_b32_e64 v239, v119, v114, s[10:11] v_cndmask_b32_e64 v234, v119, v115, s[12:13] v_cndmask_b32_e32 v237, v119, v100, vcc s_cbranch_scc0 .LBB9_15 ; %bb.10: ; in Loop: Header=BB9_9 Depth=2 v_and_b32_e32 v101, 63, v227 v_and_b32_e32 v103, 64, v227 v_max3_f32 v100, v229, v241, v240 v_xor_b32_e32 v102, 32, v101 v_add_u32_e32 v103, 64, v103 v_max3_f32 v100, v100, v239, v234 v_cmp_lt_i32_e32 vcc, v102, v103 v_max3_f32 v100, v100, v236, v238 v_cndmask_b32_e32 v102, v227, v102, vcc v_max3_f32 v100, v100, v235, v237 v_lshlrev_b32_e32 v102, 2, v102 ds_bpermute_b32 v102, v102, v100 v_xor_b32_e32 v101, 16, v101 v_cmp_lt_i32_e32 vcc, v101, v103 v_cndmask_b32_e32 v101, v227, v101, vcc v_lshlrev_b32_e32 v101, 2, v101 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v102 ds_bpermute_b32 v101, v101, v100 s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB9_12 ; %bb.11: ; in Loop: Header=BB9_9 Depth=2 s_waitcnt lgkmcnt(0) 
v_max_f32_e32 v100, v100, v101 ds_write_b32 v183, v100 .LBB9_12: ; in Loop: Header=BB9_9 Depth=2 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB9_14 ; %bb.13: ; in Loop: Header=BB9_9 Depth=2 ds_read_b64 v[100:101], v184 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v101 ds_write_b32 v185, v100 offset:256 .LBB9_14: ; %_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp4ILb0ELb1EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.i ; in Loop: Header=BB9_9 Depth=2 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v250, v185 offset:256 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, s74, v250 v_cndmask_b32_e64 v100, v250, 0, vcc v_sub_f32_e32 v100, v229, v100 v_mul_f32_e32 v229, v250, v168 v_cndmask_b32_e64 v229, v229, 0, vcc v_fma_f32 v232, v241, v168, -v229 v_fma_f32 v233, v234, v168, -v229 v_exp_f32_e32 v242, v232 v_fma_f32 v232, v240, v168, -v229 v_exp_f32_e32 v245, v233 v_exp_f32_e32 v243, v232 v_fma_f32 v232, v239, v168, -v229 v_fma_f32 v233, v236, v168, -v229 v_exp_f32_e32 v232, v232 v_exp_f32_e32 v244, v233 v_fma_f32 v233, v238, v168, -v229 v_mul_f32_e32 v100, v100, v168 v_exp_f32_e32 v246, v233 v_fma_f32 v233, v235, v168, -v229 v_fma_f32 v229, v237, v168, -v229 v_exp_f32_e32 v233, v233 v_exp_f32_e32 v247, v229 v_add_f32_e32 v229, v243, v242 v_add_f32_e32 v229, v229, v232 v_add_f32_e32 v229, v229, v245 v_exp_f32_e32 v248, v100 v_add_f32_e32 v229, v229, v244 v_add_f32_e32 v229, v229, v246 v_add_f32_e32 v229, v229, v233 v_add_f32_e32 v229, v229, v247 v_mov_b32_e32 v249, v248 v_fmac_f32_e32 v229, v248, v228 v_pk_mul_f32 v[100:101], v[248:249], v[60:61] v_pk_mul_f32 v[102:103], v[248:249], v[62:63] v_pk_mul_f32 v[104:105], v[248:249], v[56:57] 
v_pk_mul_f32 v[106:107], v[248:249], v[58:59] v_pk_mul_f32 v[108:109], v[248:249], v[52:53] v_pk_mul_f32 v[110:111], v[248:249], v[54:55] v_pk_mul_f32 v[112:113], v[248:249], v[48:49] v_pk_mul_f32 v[114:115], v[248:249], v[50:51] v_pk_mul_f32 v[116:117], v[248:249], v[44:45] v_pk_mul_f32 v[118:119], v[248:249], v[46:47] v_pk_mul_f32 v[120:121], v[248:249], v[40:41] v_pk_mul_f32 v[122:123], v[248:249], v[42:43] v_pk_mul_f32 v[124:125], v[248:249], v[36:37] v_pk_mul_f32 v[126:127], v[248:249], v[38:39] v_pk_mul_f32 v[128:129], v[248:249], v[32:33] v_pk_mul_f32 v[130:131], v[248:249], v[34:35] v_pk_mul_f32 v[132:133], v[248:249], v[28:29] v_pk_mul_f32 v[134:135], v[248:249], v[30:31] v_pk_mul_f32 v[136:137], v[248:249], v[24:25] v_pk_mul_f32 v[138:139], v[248:249], v[26:27] v_pk_mul_f32 v[140:141], v[248:249], v[20:21] v_pk_mul_f32 v[142:143], v[248:249], v[22:23] v_pk_mul_f32 v[144:145], v[248:249], v[16:17] v_pk_mul_f32 v[146:147], v[248:249], v[18:19] v_pk_mul_f32 v[148:149], v[248:249], v[12:13] v_pk_mul_f32 v[150:151], v[248:249], v[14:15] v_pk_mul_f32 v[152:153], v[248:249], v[8:9] v_pk_mul_f32 v[154:155], v[248:249], v[10:11] v_pk_mul_f32 v[156:157], v[248:249], v[4:5] v_pk_mul_f32 v[158:159], v[248:249], v[6:7] v_pk_mul_f32 v[160:161], v[248:249], v[0:1] v_pk_mul_f32 v[162:163], v[248:249], v[2:3] v_mov_b32_e32 v228, v229 v_mov_b32_e32 v229, v250 s_branch .LBB9_21 .LBB9_15: ; in Loop: Header=BB9_9 Depth=2 ; implicit-def: $vgpr247 ; implicit-def: $vgpr233 ; implicit-def: $vgpr246 ; implicit-def: $vgpr244 ; implicit-def: $vgpr228 ; implicit-def: $vgpr229 ; implicit-def: $vgpr242 ; implicit-def: $vgpr243 ; implicit-def: $vgpr232 ; implicit-def: $vgpr245 ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103 ; implicit-def: $vgpr104_vgpr105_vgpr106_vgpr107 ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111 ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115 ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119 ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123 ; implicit-def: 
$vgpr124_vgpr125_vgpr126_vgpr127 ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 ; implicit-def: $vgpr160_vgpr161_vgpr162_vgpr163 s_cbranch_execz .LBB9_21 ; %bb.16: ; in Loop: Header=BB9_9 Depth=2 v_mbcnt_hi_u32_b32 v101, -1, v226 v_and_b32_e32 v102, 63, v101 v_and_b32_e32 v104, 64, v101 v_max_f32_e32 v100, v241, v240 v_xor_b32_e32 v103, 32, v102 v_add_u32_e32 v104, 64, v104 v_max3_f32 v100, v100, v239, v234 v_cmp_lt_i32_e32 vcc, v103, v104 v_max3_f32 v100, v100, v236, v238 v_cndmask_b32_e32 v103, v101, v103, vcc v_max3_f32 v100, v100, v235, v237 v_lshlrev_b32_e32 v103, 2, v103 ds_bpermute_b32 v103, v103, v100 v_xor_b32_e32 v102, 16, v102 v_cmp_lt_i32_e32 vcc, v102, v104 v_cndmask_b32_e32 v101, v101, v102, vcc v_lshlrev_b32_e32 v101, 2, v101 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v103 ds_bpermute_b32 v101, v101, v100 s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB9_18 ; %bb.17: ; in Loop: Header=BB9_9 Depth=2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v101 ds_write_b32 v183, v100 .LBB9_18: ; in Loop: Header=BB9_9 Depth=2 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB9_20 ; %bb.19: ; in Loop: Header=BB9_9 Depth=2 ds_read_b64 v[100:101], v184 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v101 ds_write_b32 v185, v100 offset:256 .LBB9_20: ; 
%_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp4ILb1ELb1EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.i ; in Loop: Header=BB9_9 Depth=2 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v229, v185 offset:256 v_mov_b64_e32 v[106:107], v[58:59] v_mov_b64_e32 v[110:111], v[54:55] v_mov_b64_e32 v[114:115], v[50:51] v_mov_b64_e32 v[118:119], v[46:47] s_waitcnt lgkmcnt(0) v_mul_f32_e32 v100, v229, v168 v_cmp_lg_f32_e32 vcc, s74, v229 v_cndmask_b32_e32 v100, 0, v100, vcc v_fma_f32 v101, v241, v168, -v100 v_mov_b64_e32 v[122:123], v[42:43] v_exp_f32_e32 v242, v101 v_fma_f32 v101, v240, v168, -v100 v_mov_b64_e32 v[126:127], v[38:39] v_exp_f32_e32 v243, v101 v_fma_f32 v101, v239, v168, -v100 v_mov_b64_e32 v[130:131], v[34:35] v_exp_f32_e32 v232, v101 v_fma_f32 v101, v234, v168, -v100 v_mov_b64_e32 v[134:135], v[30:31] v_exp_f32_e32 v245, v101 v_fma_f32 v101, v236, v168, -v100 v_mov_b64_e32 v[138:139], v[26:27] v_exp_f32_e32 v244, v101 v_fma_f32 v101, v238, v168, -v100 v_mov_b64_e32 v[142:143], v[22:23] v_exp_f32_e32 v246, v101 v_fma_f32 v101, v235, v168, -v100 v_fma_f32 v100, v237, v168, -v100 v_exp_f32_e32 v233, v101 v_exp_f32_e32 v247, v100 v_add_f32_e32 v100, v243, v242 v_add_f32_e32 v100, v100, v232 v_add_f32_e32 v100, v100, v245 v_add_f32_e32 v100, v100, v244 v_add_f32_e32 v100, v100, v246 v_add_f32_e32 v100, v100, v233 v_add_f32_e32 v228, v100, v247 v_mov_b64_e32 v[102:103], v[62:63] v_mov_b64_e32 v[146:147], v[18:19] v_mov_b64_e32 v[150:151], v[14:15] v_mov_b64_e32 v[154:155], v[10:11] v_mov_b64_e32 v[158:159], v[6:7] v_mov_b64_e32 v[162:163], v[2:3] v_mov_b64_e32 v[100:101], v[60:61] v_mov_b64_e32 v[104:105], v[56:57] v_mov_b64_e32 v[108:109], v[52:53] v_mov_b64_e32 v[112:113], v[48:49] v_mov_b64_e32 v[116:117], v[44:45] 
v_mov_b64_e32 v[120:121], v[40:41] v_mov_b64_e32 v[124:125], v[36:37] v_mov_b64_e32 v[128:129], v[32:33] v_mov_b64_e32 v[132:133], v[28:29] v_mov_b64_e32 v[136:137], v[24:25] v_mov_b64_e32 v[140:141], v[20:21] v_mov_b64_e32 v[144:145], v[16:17] v_mov_b64_e32 v[148:149], v[12:13] v_mov_b64_e32 v[152:153], v[8:9] v_mov_b64_e32 v[156:157], v[4:5] v_mov_b64_e32 v[160:161], v[0:1] .LBB9_21: ; in Loop: Header=BB9_9 Depth=2 s_add_i32 s79, s79, -1 ; sched_barrier mask(0x00000000) v_cvt_pk_fp8_f32 v242, v242, v243, v231 v_cvt_pk_fp8_f32 v244, v244, v246, v230 v_cvt_pk_fp8_f32 v232, v232, v245, v242 op_sel:[0,0,0,1] v_cvt_pk_fp8_f32 v233, v233, v247, v244 op_sel:[0,0,0,1] ds_write2_b32 v214, v232, v233 offset1:2 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[0:3], v208 offset:40960 ; sched_barrier mask(0x00000000) ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[4:6:8:10], v171 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v200 ds_read_m64x16_b8_alt4 v[12:14:16:18], v202 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v203 v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[0:1], v[4:5], v[100:103] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[0:1], v[4:5], v[108:111] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[0:1], v[4:5], v[112:115] lit v_mov_b32_e32 v4, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v5, v13 v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[0:1], v[6:7], v[104:107] lit s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[2:3], v[4:5], v[100:103] lit v_mov_b32_e32 v4, v14 v_mov_b32_e32 v5, v15 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[2:3], v[4:5], v[104:107] lit v_mov_b32_e32 v4, v16 v_mov_b32_e32 v5, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[2:3], v[4:5], v[108:111] lit ds_read_m64x16_b8_alt4 v[4:6:8:10], v206 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v215 v_mov_b32_e32 v12, v18 v_mov_b32_e32 
v13, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[2:3], v[12:13], v[112:115] lit ds_read_m64x16_b8_alt4 v[12:14:16:18], v216 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v217 v_mmac_f32_16x16x32_fp8_fp8 v[116:119], v[0:1], v[4:5], v[116:119] lit v_mov_b32_e32 v4, v6 v_mov_b32_e32 v5, v7 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[120:123], v[0:1], v[4:5], v[120:123] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[124:127], v[0:1], v[4:5], v[124:127] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[128:131], v[0:1], v[4:5], v[128:131] lit v_mov_b32_e32 v4, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v5, v13 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[116:119], v[2:3], v[4:5], v[116:119] lit v_mov_b32_e32 v4, v14 v_mov_b32_e32 v5, v15 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[120:123], v[2:3], v[4:5], v[120:123] lit v_mov_b32_e32 v4, v16 v_mov_b32_e32 v5, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[124:127], v[2:3], v[4:5], v[124:127] lit ds_read_m64x16_b8_alt4 v[4:6:8:10], v218 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v219 v_mov_b32_e32 v12, v18 v_mov_b32_e32 v13, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[128:131], v[2:3], v[12:13], v[128:131] lit ds_read_m64x16_b8_alt4 v[12:14:16:18], v220 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v221 v_mmac_f32_16x16x32_fp8_fp8 v[132:135], v[0:1], v[4:5], v[132:135] lit v_mov_b32_e32 v4, v6 v_mov_b32_e32 v5, v7 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[136:139], v[0:1], v[4:5], v[136:139] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[140:143], v[0:1], v[4:5], v[140:143] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[144:147], v[0:1], v[4:5], v[144:147] lit v_mov_b32_e32 v4, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v5, v13 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[132:135], v[2:3], v[4:5], v[132:135] lit v_mov_b32_e32 v4, v14 v_mov_b32_e32 v5, v15 s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[136:139], v[2:3], v[4:5], v[136:139] lit v_mov_b32_e32 v4, v16 v_mov_b32_e32 v5, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[140:143], v[2:3], v[4:5], v[140:143] lit ds_read_m64x16_b8_alt4 v[4:6:8:10], v222 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v223 v_mov_b32_e32 v12, v18 v_mov_b32_e32 v13, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[144:147], v[2:3], v[12:13], v[144:147] lit ds_read_m64x16_b8_alt4 v[12:14:16:18], v224 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v225 v_mmac_f32_16x16x32_fp8_fp8 v[148:151], v[0:1], v[4:5], v[148:151] lit v_mov_b32_e32 v4, v6 v_mov_b32_e32 v5, v7 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[152:155], v[0:1], v[4:5], v[152:155] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[156:159], v[0:1], v[4:5], v[156:159] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[160:163], v[0:1], v[4:5], v[160:163] lit v_mov_b32_e32 v0, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v1, v13 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[148:151], v[2:3], v[0:1], v[148:151] lit v_mov_b32_e32 v0, v14 v_mov_b32_e32 v1, v15 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[152:155], v[2:3], v[0:1], v[152:155] lit v_mov_b32_e32 v0, v16 v_mov_b32_e32 v1, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[156:159], v[2:3], v[0:1], v[156:159] lit v_mov_b32_e32 v0, v18 v_mov_b32_e32 v1, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[160:163], v[2:3], v[0:1], v[160:163] lit ; sched_barrier mask(0x00000000) s_add_i32 s77, s77, 64 s_add_u32 s66, s66, -4 s_addc_u32 s67, s67, -1 v_mov_b32_e32 v0, v160 v_mov_b32_e32 v1, v161 v_mov_b32_e32 v2, v162 v_mov_b32_e32 v3, v163 v_mov_b32_e32 v4, v156 v_mov_b32_e32 v5, v157 v_mov_b32_e32 v6, v158 v_mov_b32_e32 v7, v159 v_mov_b32_e32 v8, v152 v_mov_b32_e32 v9, v153 v_mov_b32_e32 v10, v154 v_mov_b32_e32 v11, v155 v_mov_b32_e32 v12, v148 v_mov_b32_e32 v13, v149 v_mov_b32_e32 v14, v150 v_mov_b32_e32 v15, v151 v_mov_b32_e32 v16, v144 v_mov_b32_e32 
v17, v145 v_mov_b32_e32 v18, v146 v_mov_b32_e32 v19, v147 v_mov_b32_e32 v20, v140 v_mov_b32_e32 v21, v141 v_mov_b32_e32 v22, v142 v_mov_b32_e32 v23, v143 v_mov_b32_e32 v24, v136 v_mov_b32_e32 v25, v137 v_mov_b32_e32 v26, v138 v_mov_b32_e32 v27, v139 v_mov_b32_e32 v28, v132 v_mov_b32_e32 v29, v133 v_mov_b32_e32 v30, v134 v_mov_b32_e32 v31, v135 v_mov_b32_e32 v32, v128 v_mov_b32_e32 v33, v129 v_mov_b32_e32 v34, v130 v_mov_b32_e32 v35, v131 v_mov_b32_e32 v36, v124 v_mov_b32_e32 v37, v125 v_mov_b32_e32 v38, v126 v_mov_b32_e32 v39, v127 v_mov_b32_e32 v40, v120 v_mov_b32_e32 v41, v121 v_mov_b32_e32 v42, v122 v_mov_b32_e32 v43, v123 v_mov_b32_e32 v44, v116 v_mov_b32_e32 v45, v117 v_mov_b32_e32 v46, v118 v_mov_b32_e32 v47, v119 v_mov_b32_e32 v48, v112 v_mov_b32_e32 v49, v113 v_mov_b32_e32 v50, v114 v_mov_b32_e32 v51, v115 v_mov_b32_e32 v52, v108 v_mov_b32_e32 v53, v109 v_mov_b32_e32 v54, v110 v_mov_b32_e32 v55, v111 v_mov_b32_e32 v56, v104 v_mov_b32_e32 v57, v105 v_mov_b32_e32 v58, v106 v_mov_b32_e32 v59, v107 v_mov_b32_e32 v60, v100 v_mov_b32_e32 v61, v101 v_mov_b32_e32 v62, v102 s_cmp_le_i32 s79, s75 v_mov_b32_e32 v63, v103 s_cbranch_scc1 .LBB9_23 ; %bb.22: ; in Loop: Header=BB9_9 Depth=2 v_mov_b32_e32 v231, v232 v_mov_b32_e32 v230, v233 s_branch .LBB9_9 .LBB9_23: ; %Flow918 ; in Loop: Header=BB9_5 Depth=1 s_cmp_eq_u32 s75, 0 s_cselect_b64 s[8:9], -1, 0 s_ashr_i32 s10, s76, 31 s_lshr_b32 s10, s10, 26 s_add_i32 s76, s76, s10 s_ashr_i32 s10, s76, 6 s_cmp_eq_u32 s64, s10 s_cselect_b64 s[10:11], -1, 0 s_and_b64 s[8:9], s[8:9], s[10:11] s_andn2_b64 vcc, exec, s[8:9] s_mov_b64 s[8:9], -1 s_cbranch_vccnz .LBB9_25 ; %bb.24: ; %Flow912 ; in Loop: Header=BB9_5 Depth=1 s_and_b64 vcc, exec, s[8:9] s_cbranch_vccz .LBB9_4 s_branch .LBB9_34 .LBB9_25: ; in Loop: Header=BB9_5 Depth=1 s_add_u32 s8, s40, s62 s_addc_u32 s9, s41, s63 global_load_dword v66, v164, s[8:9] v_mbcnt_hi_u32_b32 v64, -1, v226 v_and_b32_e32 v65, 63, v64 v_and_b32_e32 v68, 64, v64 v_xor_b32_e32 v67, 32, v65 
v_add_u32_e32 v68, 64, v68 v_cmp_lt_i32_e32 vcc, v67, v68 v_cndmask_b32_e32 v67, v64, v67, vcc v_lshlrev_b32_e32 v67, 2, v67 ds_bpermute_b32 v67, v67, v228 v_xor_b32_e32 v65, 16, v65 v_cmp_lt_i32_e32 vcc, v65, v68 v_cndmask_b32_e32 v64, v64, v65, vcc v_lshlrev_b32_e32 v65, 2, v64 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v67, v228 ds_bpermute_b32 v65, v65, v64 s_waitcnt vmcnt(0) v_readfirstlane_b32 s10, v66 s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB9_27 ; %bb.26: ; in Loop: Header=BB9_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v64, v65 ds_write_b32 v193, v64 .LBB9_27: ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB9_29 ; %bb.28: ; in Loop: Header=BB9_5 Depth=1 ds_read_b64 v[64:65], v194 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v65, v64 ds_write_b32 v205, v64 offset:256 .LBB9_29: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp4ILb0ELb1ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_and_b64 s[8:9], s[42:43], exec s_cselect_b32 s8, s38, 0 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v64, v205 offset:256 s_add_i32 s8, s10, s8 s_mul_i32 s8, s8, s37 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s10, s8, s6 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v64 s_and_saveexec_b64 s[12:13], s[60:61] s_cbranch_execz .LBB9_31 ; %bb.30: ; in Loop: Header=BB9_5 Depth=1 v_log_f32_e32 v65, v64 s_ashr_i32 s11, s10, 31 v_mov_b32_e32 v66, 0xff800000 s_lshl_b64 s[8:9], s[10:11], 2 v_mul_f32_e32 v65, 0x3f317218, v65 v_fmac_f32_e32 v65, v229, v167 v_cndmask_b32_e32 v67, v65, v66, vcc v_mov_b32_e32 v66, s9 v_add_co_u32_e64 v65, s[8:9], s8, v195 v_addc_co_u32_e64 v66, s[8:9], v196, v66, s[8:9] 
global_store_dword v[65:66], v67, off .LBB9_31: ; %.loopexit713.i ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[12:13] s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB9_33 ; %bb.32: ; %.preheader710.i ; in Loop: Header=BB9_5 Depth=1 v_rcp_f32_e32 v67, v64 s_mul_i32 s10, s10, s36 v_mov_b32_e32 v64, v60 v_mov_b32_e32 v65, v56 v_mul_f32_e32 v67, s34, v67 v_cndmask_b32_e64 v122, v67, 1.0, vcc v_mov_b32_e32 v123, v122 v_mov_b32_e32 v78, v45 v_mov_b32_e32 v79, v41 v_mov_b32_e32 v94, v29 v_mov_b32_e32 v95, v25 v_mov_b32_e32 v110, v13 v_mov_b32_e32 v111, v9 s_ashr_i32 s11, s10, 31 v_pk_mul_f32 v[124:125], v[122:123], v[64:65] v_mov_b32_e32 v64, v62 v_mov_b32_e32 v65, v58 v_pk_mul_f32 v[80:81], v[122:123], v[78:79] v_mov_b32_e32 v78, v46 v_mov_b32_e32 v79, v42 v_pk_mul_f32 v[96:97], v[122:123], v[94:95] v_mov_b32_e32 v94, v30 v_mov_b32_e32 v95, v26 v_pk_mul_f32 v[112:113], v[122:123], v[110:111] v_mov_b32_e32 v110, v14 v_mov_b32_e32 v111, v10 s_lshl_b64 s[10:11], s[10:11], 2 v_pk_mul_f32 v[70:71], v[122:123], v[64:65] v_mov_b32_e32 v64, v63 v_mov_b32_e32 v65, v59 v_mov_b32_e32 v76, v55 v_mov_b32_e32 v77, v51 v_pk_mul_f32 v[84:85], v[122:123], v[78:79] v_mov_b32_e32 v78, v47 v_mov_b32_e32 v79, v43 v_pk_mul_f32 v[100:101], v[122:123], v[94:95] v_mov_b32_e32 v94, v31 v_mov_b32_e32 v95, v27 v_pk_mul_f32 v[116:117], v[122:123], v[110:111] v_mov_b32_e32 v110, v15 v_mov_b32_e32 v111, v11 s_add_u32 s10, s56, s10 v_mov_b32_e32 v66, v61 v_mov_b32_e32 v67, v57 v_pk_mul_f32 v[74:75], v[122:123], v[64:65] v_mov_b32_e32 v64, v52 v_mov_b32_e32 v65, v48 v_mov_b32_e32 v68, v53 v_mov_b32_e32 v69, v49 v_mov_b32_e32 v72, v54 v_mov_b32_e32 v73, v50 v_pk_mul_f32 v[126:127], v[122:123], v[76:77] v_mov_b32_e32 v76, v44 v_mov_b32_e32 v77, v40 v_pk_mul_f32 v[88:89], v[122:123], v[78:79] v_mov_b32_e32 v78, v36 v_mov_b32_e32 v79, v32 v_mov_b32_e32 v82, v37 v_mov_b32_e32 v83, v33 v_mov_b32_e32 v86, v38 v_mov_b32_e32 v87, v34 v_mov_b32_e32 v90, v39 v_mov_b32_e32 v91, v35 v_mov_b32_e32 
v92, v28 v_mov_b32_e32 v93, v24 v_pk_mul_f32 v[104:105], v[122:123], v[94:95] v_mov_b32_e32 v94, v20 v_mov_b32_e32 v95, v16 v_mov_b32_e32 v98, v21 v_mov_b32_e32 v99, v17 v_mov_b32_e32 v102, v22 v_mov_b32_e32 v103, v18 v_mov_b32_e32 v106, v23 v_mov_b32_e32 v107, v19 v_mov_b32_e32 v108, v12 v_mov_b32_e32 v109, v8 v_pk_mul_f32 v[120:121], v[122:123], v[110:111] v_mov_b32_e32 v110, v4 v_mov_b32_e32 v111, v0 v_mov_b32_e32 v114, v5 v_mov_b32_e32 v115, v1 v_mov_b32_e32 v118, v6 v_mov_b32_e32 v119, v2 v_mov_b32_e32 v128, v7 v_mov_b32_e32 v129, v3 s_addc_u32 s11, s57, s11 v_pk_mul_f32 v[66:67], v[122:123], v[66:67] v_pk_mul_f32 v[64:65], v[122:123], v[64:65] v_pk_mul_f32 v[68:69], v[122:123], v[68:69] v_pk_mul_f32 v[72:73], v[122:123], v[72:73] v_pk_mul_f32 v[76:77], v[122:123], v[76:77] v_pk_mul_f32 v[78:79], v[122:123], v[78:79] v_pk_mul_f32 v[82:83], v[122:123], v[82:83] v_pk_mul_f32 v[86:87], v[122:123], v[86:87] v_pk_mul_f32 v[90:91], v[122:123], v[90:91] v_pk_mul_f32 v[92:93], v[122:123], v[92:93] v_pk_mul_f32 v[94:95], v[122:123], v[94:95] v_pk_mul_f32 v[98:99], v[122:123], v[98:99] v_pk_mul_f32 v[102:103], v[122:123], v[102:103] v_pk_mul_f32 v[106:107], v[122:123], v[106:107] v_pk_mul_f32 v[108:109], v[122:123], v[108:109] v_pk_mul_f32 v[110:111], v[122:123], v[110:111] v_pk_mul_f32 v[114:115], v[122:123], v[114:115] v_pk_mul_f32 v[118:119], v[122:123], v[118:119] v_pk_mul_f32 v[122:123], v[122:123], v[128:129] global_store_dwordx2 v209, v[124:125], s[10:11] global_store_dwordx4 v213, v[64:67], s[10:11] offset:8 global_store_dwordx4 v213, v[68:71], s[10:11] offset:24 global_store_dwordx4 v213, v[72:75], s[10:11] offset:40 global_store_dwordx2 v213, v[126:127], s[10:11] offset:56 global_store_dwordx4 v213, v[76:79], s[10:11] offset:512 global_store_dwordx4 v210, v[80:83], s[10:11] offset:16 global_store_dwordx4 v210, v[84:87], s[10:11] offset:32 global_store_dwordx4 v210, v[88:91], s[10:11] offset:48 global_store_dwordx4 v213, v[92:95], s[10:11] offset:1024 
global_store_dwordx4 v211, v[96:99], s[10:11] offset:16 global_store_dwordx4 v211, v[100:103], s[10:11] offset:32 global_store_dwordx4 v211, v[104:107], s[10:11] offset:48 global_store_dwordx4 v213, v[108:111], s[10:11] offset:1536 global_store_dwordx4 v212, v[112:115], s[10:11] offset:16 global_store_dwordx4 v212, v[116:119], s[10:11] offset:32 global_store_dwordx4 v212, v[120:123], s[10:11] offset:48 .LBB9_33: ; %Flow ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_branch .LBB9_4 .LBB9_34: ; in Loop: Header=BB9_5 Depth=1 v_mbcnt_hi_u32_b32 v64, -1, v226 v_and_b32_e32 v65, 63, v64 v_and_b32_e32 v67, 64, v64 v_xor_b32_e32 v66, 32, v65 v_add_u32_e32 v67, 64, v67 v_cmp_lt_i32_e32 vcc, v66, v67 v_cndmask_b32_e32 v66, v64, v66, vcc v_lshlrev_b32_e32 v66, 2, v66 ds_bpermute_b32 v66, v66, v228 v_xor_b32_e32 v65, 16, v65 v_cmp_lt_i32_e32 vcc, v65, v67 v_cndmask_b32_e32 v64, v64, v65, vcc v_lshlrev_b32_e32 v65, 2, v64 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v66, v228 ds_bpermute_b32 v65, v65, v64 s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB9_36 ; %bb.35: ; in Loop: Header=BB9_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v64, v65 ds_write_b32 v193, v64 .LBB9_36: ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB9_38 ; %bb.37: ; in Loop: Header=BB9_5 Depth=1 ds_read_b64 v[64:65], v194 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v65, v64 ds_write_b32 v205, v64 offset:256 .LBB9_38: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp4ILb0ELb0ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v64, v205 offset:256 s_waitcnt lgkmcnt(0) 
v_cmp_eq_f32_e32 vcc, 0, v64 s_and_saveexec_b64 s[10:11], s[60:61] s_cbranch_execz .LBB9_40 ; %bb.39: ; in Loop: Header=BB9_5 Depth=1 s_mul_i32 s8, s26, s37 v_log_f32_e32 v65, v64 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s8, s8, s6 s_ashr_i32 s9, s8, 31 v_mul_f32_e32 v65, 0x3f317218, v65 v_fmac_f32_e32 v65, v229, v167 v_mov_b32_e32 v66, 0x7f800000 s_lshl_b64 s[8:9], s[8:9], 2 v_cndmask_b32_e32 v67, v65, v66, vcc v_mov_b32_e32 v66, s9 v_add_co_u32_e64 v65, s[8:9], s8, v198 v_addc_co_u32_e64 v66, s[8:9], v199, v66, s[8:9] global_store_dword v[65:66], v67, off .LBB9_40: ; %.loopexit.i ; in Loop: Header=BB9_5 Depth=1 s_or_b64 exec, exec, s[10:11] s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB9_3 ; %bb.41: ; %.preheader.i ; in Loop: Header=BB9_5 Depth=1 s_mul_i32 s10, s26, s29 s_mul_hi_u32 s11, s26, s28 s_add_i32 s10, s11, s10 s_mul_i32 s11, s27, s28 v_rcp_f32_e32 v64, v64 s_add_i32 s11, s10, s11 s_mul_i32 s10, s26, s28 s_add_u32 s10, s72, s10 s_addc_u32 s11, s73, s11 s_lshl_b64 s[10:11], s[10:11], 1 v_mul_f32_e32 v64, s34, v64 s_add_u32 s10, s24, s10 v_cndmask_b32_e64 v64, v64, 1.0, vcc s_addc_u32 s11, s25, s11 v_mul_f32_e32 v60, v64, v60 v_mul_f32_e32 v61, v64, v61 v_mul_f32_e32 v56, v64, v56 v_mul_f32_e32 v57, v64, v57 v_mul_f32_e32 v52, v64, v52 v_mul_f32_e32 v53, v64, v53 v_mul_f32_e32 v48, v64, v48 v_mul_f32_e32 v49, v64, v49 v_mul_f32_e32 v65, v64, v16 v_mul_f32_e32 v81, v64, v2 v_mov_b32_e32 v2, s11 v_add_co_u32_e32 v16, vcc, s10, v165 v_mul_f32_e32 v62, v64, v62 v_mul_f32_e32 v63, v64, v63 v_mul_f32_e32 v58, v64, v58 v_mul_f32_e32 v59, v64, v59 v_mul_f32_e32 v54, v64, v54 v_mul_f32_e32 v55, v64, v55 v_mul_f32_e32 v50, v64, v50 v_mul_f32_e32 v51, v64, v51 v_mul_f32_e32 v44, v64, v44 v_mul_f32_e32 v45, v64, v45 v_mul_f32_e32 v46, v64, v46 v_mul_f32_e32 v47, v64, v47 v_mul_f32_e32 v40, v64, v40 v_mul_f32_e32 v41, v64, v41 v_mul_f32_e32 v42, v64, v42 v_mul_f32_e32 v43, v64, v43 v_mul_f32_e32 v36, v64, v36 v_mul_f32_e32 v37, v64, v37 
v_mul_f32_e32 v38, v64, v38 v_mul_f32_e32 v39, v64, v39 v_mul_f32_e32 v32, v64, v32 v_mul_f32_e32 v33, v64, v33 v_mul_f32_e32 v34, v64, v34 v_mul_f32_e32 v35, v64, v35 v_mul_f32_e32 v28, v64, v28 v_mul_f32_e32 v29, v64, v29 v_mul_f32_e32 v30, v64, v30 v_mul_f32_e32 v31, v64, v31 v_mul_f32_e32 v24, v64, v24 v_mul_f32_e32 v25, v64, v25 v_mul_f32_e32 v26, v64, v26 v_mul_f32_e32 v27, v64, v27 v_mul_f32_e32 v20, v64, v20 v_mul_f32_e32 v21, v64, v21 v_mul_f32_e32 v22, v64, v22 v_mul_f32_e32 v23, v64, v23 v_mul_f32_e32 v66, v64, v17 v_mul_f32_e32 v18, v64, v18 v_mul_f32_e32 v19, v64, v19 v_mul_f32_e32 v67, v64, v12 v_mul_f32_e32 v68, v64, v13 v_mul_f32_e32 v69, v64, v14 v_mul_f32_e32 v70, v64, v15 v_mul_f32_e32 v71, v64, v8 v_mul_f32_e32 v72, v64, v9 v_mul_f32_e32 v73, v64, v10 v_mul_f32_e32 v74, v64, v11 v_mul_f32_e32 v75, v64, v4 v_mul_f32_e32 v76, v64, v5 v_mul_f32_e32 v77, v64, v6 v_mul_f32_e32 v78, v64, v7 v_mul_f32_e32 v79, v64, v0 v_mul_f32_e32 v80, v64, v1 v_mul_f32_e32 v64, v64, v3 v_cvt_pk_bf16_f32 v0, v60, v56 v_cvt_pk_bf16_f32 v1, v52, v48 v_addc_co_u32_e32 v17, vcc, v2, v166, vcc v_cvt_pk_bf16_f32 v2, v61, v57 v_cvt_pk_bf16_f32 v3, v53, v49 v_cvt_pk_bf16_f32 v4, v62, v58 v_cvt_pk_bf16_f32 v5, v54, v50 v_cvt_pk_bf16_f32 v6, v63, v59 v_cvt_pk_bf16_f32 v7, v55, v51 v_cvt_pk_bf16_f32 v8, v44, v40 v_cvt_pk_bf16_f32 v9, v36, v32 v_cvt_pk_bf16_f32 v10, v45, v41 v_cvt_pk_bf16_f32 v11, v37, v33 v_cvt_pk_bf16_f32 v12, v46, v42 v_cvt_pk_bf16_f32 v13, v38, v34 v_cvt_pk_bf16_f32 v14, v47, v43 v_cvt_pk_bf16_f32 v15, v39, v35 global_store_dwordx4 v[16:17], v[0:3], off global_store_dwordx4 v[16:17], v[4:7], off offset:16 global_store_dwordx4 v[16:17], v[8:11], off offset:256 global_store_dwordx4 v[16:17], v[12:15], off offset:272 v_cvt_pk_bf16_f32 v0, v28, v24 v_cvt_pk_bf16_f32 v1, v20, v65 v_cvt_pk_bf16_f32 v2, v29, v25 v_cvt_pk_bf16_f32 v3, v21, v66 v_cvt_pk_bf16_f32 v4, v30, v26 v_cvt_pk_bf16_f32 v5, v22, v18 v_cvt_pk_bf16_f32 v6, v31, v27 v_cvt_pk_bf16_f32 v7, v23, v19 
v_cvt_pk_bf16_f32 v8, v67, v71 v_cvt_pk_bf16_f32 v9, v75, v79 v_cvt_pk_bf16_f32 v10, v68, v72 v_cvt_pk_bf16_f32 v11, v76, v80 v_cvt_pk_bf16_f32 v12, v69, v73 v_cvt_pk_bf16_f32 v13, v77, v81 v_cvt_pk_bf16_f32 v14, v70, v74 v_cvt_pk_bf16_f32 v15, v78, v64 global_store_dwordx4 v[16:17], v[0:3], off offset:512 global_store_dwordx4 v[16:17], v[4:7], off offset:528 global_store_dwordx4 v[16:17], v[8:11], off offset:768 global_store_dwordx4 v[16:17], v[12:15], off offset:784 s_branch .LBB9_3 .LBB9_42: ; %.loopexit s_endpgm .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 0 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_sgpr_workgroup_id_z 1 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 251 .amdhsa_next_free_sgpr 80 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 .amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 
.amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end9: .size _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params, .Lfunc_end9-_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 8832 ; NumSgprs: 84 ; NumVgprs: 251 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 0 bytes/workgroup (compile time only) ; SGPRBlocks: 10 ; VGPRBlocks: 62 ; NumSGPRsForWavesPerEU: 84 ; NumVGPRsForWavesPerEU: 251 ; Occupancy: 1 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params .globl _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params,@function _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params: ; @_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x110 s_load_dwordx2 s[34:35], s[4:5], 0x0 s_lshl_b32 s2, s8, 3 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[2:3], s[2:3], 2 s_waitcnt lgkmcnt(0) s_add_u32 s0, s0, s2 s_addc_u32 s1, s1, s3 s_load_dwordx4 s[12:15], s[0:1], 0x0 s_waitcnt lgkmcnt(0) s_cmp_ge_i32 s12, s34 s_cbranch_scc1 .LBB10_42 ; %bb.1: s_cmp_gt_i32 s12, s14 s_cbranch_scc1 .LBB10_42 ; %bb.2: ; %.lr.ph s_load_dwordx4 s[16:19], s[4:5], 0x28 s_load_dwordx2 s[2:3], s[4:5], 0x48 s_load_dwordx2 s[48:49], s[4:5], 0x68 s_load_dwordx4 s[20:23], s[4:5], 0x58 s_load_dwordx2 s[50:51], s[4:5], 0x80 
s_load_dwordx4 s[24:27], s[4:5], 0x90 s_load_dwordx2 s[52:53], s[4:5], 0x130 s_load_dwordx4 s[8:11], s[4:5], 0x140 s_load_dword s33, s[4:5], 0xb0 s_load_dwordx4 s[40:43], s[4:5], 0xc0 s_load_dwordx4 s[28:31], s[4:5], 0xc s_load_dwordx2 s[36:37], s[4:5], 0x20 s_waitcnt lgkmcnt(0) s_load_dword s31, s[8:9], 0x0 s_load_dword s34, s[10:11], 0x0 s_ashr_i32 s8, s13, 31 s_lshr_b32 s8, s8, 26 s_lshl_b32 s6, s6, 5 s_add_i32 s8, s13, s8 s_ashr_i32 s44, s6, 31 s_ashr_i32 s45, s7, 31 s_ashr_i32 s13, s8, 6 s_mul_i32 s8, s26, s44 s_mul_hi_u32 s9, s26, s6 s_mul_hi_u32 s10, s42, s7 s_mul_i32 s11, s42, s45 s_add_i32 s8, s9, s8 s_mul_i32 s9, s27, s6 s_add_i32 s10, s10, s11 s_mul_i32 s11, s43, s7 s_add_i32 s8, s8, s9 s_mul_i32 s9, s26, s6 s_add_i32 s10, s10, s11 s_mul_i32 s11, s42, s7 s_add_u32 s27, s11, s9 s_addc_u32 s60, s10, s8 s_ashr_i32 s38, s30, 31 s_add_i32 s8, s30, s38 s_xor_b32 s30, s8, s38 v_cvt_f32_u32_e32 v1, s30 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v2, s31 v_mul_f32_e32 v2, s34, v2 v_mul_f32_e32 v167, s36, v2 v_rcp_iflag_f32_e32 v1, v1 v_mul_f32_e32 v168, s37, v2 s_sub_i32 s36, 0, s30 s_add_i32 s31, s7, s45 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 s_xor_b32 s31, s31, s45 s_xor_b32 s46, s45, s38 v_lshlrev_b32_e32 v5, 4, v0 v_readfirstlane_b32 s37, v1 s_mul_i32 s36, s36, s37 s_mul_hi_u32 s36, s37, s36 s_add_i32 s37, s37, s36 s_mul_hi_u32 s36, s31, s37 s_mul_i32 s37, s36, s30 s_sub_i32 s31, s31, s37 s_add_i32 s37, s36, 1 s_sub_i32 s38, s31, s30 s_cmp_ge_u32 s31, s30 s_cselect_b32 s47, s37, s36 s_cselect_b32 s31, s38, s31 s_add_i32 s54, s47, 1 s_cmp_ge_u32 s31, s30 v_lshlrev_b32_e32 v9, 1, v0 s_load_dwordx2 s[42:43], s[4:5], 0xe0 s_load_dwordx4 s[8:11], s[4:5], 0xf0 s_load_dwordx2 s[30:31], s[4:5], 0x100 s_load_dwordx4 s[36:39], s[4:5], 0x120 s_load_dword s61, s[0:1], 0x10 s_cselect_b32 s0, s54, s47 v_and_b32_e32 v8, 0x3f0, v5 v_and_b32_e32 v9, 0x70, v9 s_xor_b32 s0, s0, s46 v_lshrrev_b32_e32 v2, 7, v0 v_xor_b32_e32 v8, v8, v9 s_sub_i32 s0, s0, s46 
v_and_b32_e32 v1, 63, v0 v_lshl_or_b32 v8, v2, 12, v8 s_ashr_i32 s1, s0, 31 v_add_u32_e32 v171, 0, v8 v_lshrrev_b32_e32 v8, 3, v0 v_cmp_lt_u32_e32 vcc, 31, v1 s_waitcnt lgkmcnt(0) s_mul_i32 s1, s42, s1 s_mul_hi_u32 s4, s42, s0 v_lshl_add_u32 v4, v1, 4, 0 v_xor_b32_e32 v8, v8, v0 v_bfe_u32 v9, v0, 2, 4 v_cndmask_b32_e64 v1, 0, 1, vcc s_add_i32 s1, s4, s1 s_mul_i32 s4, s43, s0 v_xor_b32_e32 v172, v9, v1 v_lshlrev_b32_e32 v1, 4, v8 s_add_i32 s1, s1, s4 s_mul_i32 s0, s42, s0 v_and_b32_e32 v173, 48, v1 v_lshrrev_b32_e32 v1, 2, v0 s_add_u32 s62, s2, s0 v_and_b32_e32 v169, 15, v0 v_and_b32_e32 v170, 48, v0 v_lshlrev_b32_e32 v3, 6, v0 s_movk_i32 s0, 0x3c0 v_lshlrev_b32_e32 v7, 3, v0 v_and_b32_e32 v8, 12, v1 v_and_or_b32 v3, v3, s0, v170 v_and_b32_e32 v7, 0x70, v7 v_lshl_or_b32 v182, v2, 4, v8 v_and_or_b32 v8, v1, 16, v169 v_xor_b32_e32 v3, v3, v7 v_lshlrev_b32_e32 v1, 3, v8 s_addc_u32 s63, s3, s1 v_lshl_or_b32 v7, v2, 10, v3 v_lshrrev_b32_e32 v3, 5, v0 v_lshl_or_b32 v9, v2, 2, v1 s_add_i32 s2, 0, 0xaa00 v_and_b32_e32 v6, 0x400, v5 v_or_b32_e32 v3, v5, v3 v_add_u32_e32 v183, s2, v9 v_add_u32_e32 v184, s2, v1 s_movk_i32 s2, 0x3f4 v_and_or_b32 v11, v3, s2, v6 s_add_i32 s2, 0, 0xa800 v_add_u32_e32 v195, s2, v1 v_bfe_u32 v1, v0, 4, 2 v_add_u32_e32 v194, s2, v9 v_lshl_or_b32 v9, v2, 6, v1 v_lshrrev_b32_e32 v1, 1, v0 v_and_or_b32 v163, v1, 64, v170 v_mov_b32_e32 v164, 0 v_mad_u64_u32 v[1:2], s[2:3], s40, v8, v[163:164] s_sub_i32 s64, s35, s6 v_and_b32_e32 v10, 0xb0, v0 v_mad_u64_u32 v[2:3], s[2:3], s41, v8, v[2:3] v_cmp_eq_u32_e64 s[0:1], 0, v10 v_lshlrev_b32_e32 v10, 2, v8 v_cmp_eq_u32_e32 vcc, 0, v9 v_cmp_gt_i32_e64 s[2:3], s64, v8 s_mul_i32 s4, s40, s44 s_mul_hi_u32 s5, s40, s6 s_and_b64 s[54:55], vcc, s[2:3] v_mov_b32_e32 v3, s39 v_add_co_u32_e32 v196, vcc, s38, v10 s_mul_i32 s42, s41, s6 s_add_i32 s4, s5, s4 s_mul_hi_u32 s5, s8, s7 s_mul_i32 s41, s8, s45 v_addc_co_u32_e32 v197, vcc, 0, v3, vcc s_add_i32 s5, s5, s41 s_mul_i32 s9, s9, s7 v_mov_b32_e32 v3, s23 v_add_co_u32_e32 
v199, vcc, s22, v10 s_mul_i32 s40, s40, s6 s_add_i32 s4, s4, s42 s_add_i32 s5, s5, s9 s_mul_i32 s8, s8, s7 v_addc_co_u32_e32 v200, vcc, 0, v3, vcc v_lshlrev_b32_e32 v3, 9, v8 s_add_u32 s65, s8, s40 v_add_u32_e32 v198, v4, v6 v_lshrrev_b32_e32 v202, 6, v0 v_and_b32_e32 v0, 0x7f0, v5 v_or_b32_e32 v4, v3, v163 v_or_b32_e32 v5, 0x80, v163 v_or_b32_e32 v6, 0x100, v163 v_add_u32_e32 v207, 0, v7 v_or_b32_e32 v7, 0x180, v163 v_add_u32_e32 v8, 0, v11 v_lshlrev_b64 v[165:166], 1, v[1:2] v_mbcnt_lo_u32_b32 v225, -1, 0 v_or_b32_e32 v174, 64, v173 v_or_b32_e32 v175, 0x80, v173 v_or_b32_e32 v176, 0xc0, v173 v_or_b32_e32 v177, 0x100, v173 v_or_b32_e32 v178, 0x140, v173 v_or_b32_e32 v179, 0x180, v173 v_or_b32_e32 v180, 0x1c0, v173 v_or_b32_e32 v181, 0x200, v173 v_sub_u32_e32 v185, v184, v10 v_or_b32_e32 v186, 1, v182 v_or_b32_e32 v187, 2, v182 v_or_b32_e32 v188, 3, v182 v_or_b32_e32 v189, 32, v182 v_or_b32_e32 v190, 33, v182 v_or_b32_e32 v191, 34, v182 v_or_b32_e32 v192, 35, v182 v_add_u32_e32 v193, 0x400, v171 s_addc_u32 s66, s5, s4 v_add_u32_e32 v201, 0x800, v171 s_mov_b32 s40, 0 v_add_u32_e32 v203, 0xc00, v171 v_add_u32_e32 v204, 0x2000, v171 v_cmp_eq_u32_e64 s[4:5], 0, v170 v_sub_u32_e32 v205, v195, v10 v_add_u32_e32 v206, 0x2400, v171 s_mov_b32 s47, 0x20000 s_brev_b32 s46, 1 s_mov_b32 s67, 0xff800000 v_add_u32_e32 v208, 0, v0 v_lshlrev_b32_e32 v209, 2, v4 v_add_lshl_u32 v210, v3, v5, 2 v_add_lshl_u32 v211, v3, v6, 2 v_add_lshl_u32 v212, v3, v7, 2 v_add_lshl_u32 v213, v3, v163, 2 v_add_u32_e32 v214, 0xa000, v8 v_add_u32_e32 v215, 0x2800, v171 v_add_u32_e32 v216, 0x2c00, v171 s_mov_b32 s22, s12 v_add_u32_e32 v217, 0x4000, v171 v_add_u32_e32 v218, 0x4400, v171 v_add_u32_e32 v219, 0x4800, v171 v_add_u32_e32 v220, 0x4c00, v171 v_add_u32_e32 v221, 0x6000, v171 v_add_u32_e32 v222, 0x6400, v171 v_add_u32_e32 v223, 0x6800, v171 v_add_u32_e32 v224, 0x6c00, v171 v_mbcnt_hi_u32_b32 v226, -1, v225 s_branch .LBB10_5 .LBB10_3: ; %Flow909 ; in Loop: Header=BB10_5 Depth=1 s_or_b64 exec, exec, 
s[8:9] .LBB10_4: ; %_ZN5flash49compute_attn_1rowblock_splitkv_mla_fp8_gfx938_TP4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEvRK20Flash_fwd_mla_paramsiiiiiiibRT1_fff.exit ; in Loop: Header=BB10_5 Depth=1 s_add_i32 s8, s22, 1 s_cmp_lt_i32 s22, s14 s_mov_b32 s22, s8 s_cbranch_scc0 .LBB10_42 .LBB10_5: ; =>This Loop Header: Depth=1 ; Child Loop BB10_9 Depth 2 s_ashr_i32 s23, s22, 31 s_lshl_b64 s[56:57], s[22:23], 2 s_add_u32 s8, s16, s56 s_addc_u32 s9, s17, s57 global_load_dword v0, v164, s[8:9] s_cmp_le_i32 s22, s12 s_waitcnt vmcnt(0) v_readfirstlane_b32 s68, v0 s_cbranch_scc1 .LBB10_7 ; %bb.6: ; in Loop: Header=BB10_5 Depth=1 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) .LBB10_7: ; in Loop: Header=BB10_5 Depth=1 s_cmp_eq_u32 s22, s12 s_cselect_b64 s[38:39], -1, 0 s_and_b64 s[8:9], s[38:39], exec s_cselect_b32 s69, s13, 0 s_cmp_eq_u32 s22, s14 s_cselect_b32 s8, s15, s68 s_add_i32 s8, s8, 63 s_ashr_i32 s9, s8, 31 s_lshr_b32 s9, s9, 26 s_add_i32 s8, s8, s9 s_ashr_i32 s58, s8, 6 s_mul_i32 s8, s22, s49 s_mul_hi_u32 s9, s22, s48 s_add_i32 s8, s9, s8 s_mul_i32 s9, s23, s48 s_add_i32 s8, s8, s9 s_mul_i32 s9, s22, s48 s_add_u32 s9, s27, s9 s_addc_u32 s8, s60, s8 s_add_u32 s44, s18, s9 s_addc_u32 s45, s19, s8 v_readfirstlane_b32 s8, v202 s_lshr_b32 s9, s8, 31 s_add_i32 s9, s8, s9 s_ashr_i32 s41, s9, 1 s_and_b32 s9, s9, -2 s_sub_i32 s42, s8, s9 v_lshl_or_b32 v3, s42, 4, v169 v_lshl_or_b32 v0, s41, 6, v170 v_mad_u64_u32 v[1:2], s[8:9], v3, s26, v[0:1] s_lshl_b32 s8, s42, 10 s_lshl_b32 s9, s41, 11 v_cmp_gt_i32_e32 vcc, s64, v3 s_add_i32 s8, s8, s9 v_cndmask_b32_e32 v2, -1, v1, vcc s_add_i32 s41, s8, 0 ;;#ASMSTART s_mov_b32 m0, s41 buffer_load_dwordx4 v2, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v2, 0x80, v1 v_cndmask_b32_e32 v2, -1, v2, vcc s_add_i32 s8, s41, 0x1000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v2, s[44:47], s40 ,offen offset:0, 
lds ;;#ASMEND v_add_u32_e32 v2, 0x100, v1 v_cndmask_b32_e32 v2, -1, v2, vcc s_add_i32 s8, s41, 0x2000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v2, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v2, 0x180, v1 v_cndmask_b32_e32 v2, -1, v2, vcc s_add_i32 s8, s41, 0x3000 ;;#ASMSTART s_mov_b32 m0, s8 buffer_load_dwordx4 v2, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_cmp_gt_i32_e64 s[8:9], 64, v0 v_add_u32_e32 v1, 0x200, v1 s_and_b64 vcc, s[8:9], vcc v_cndmask_b32_e32 v0, -1, v1, vcc s_addk_i32 s41, 0x4000 ;;#ASMSTART s_mov_b32 m0, s41 buffer_load_dwordx4 v0, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_b128 v[64:67], v198 ds_read_b128 v[68:71], v198 offset:2048 ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_b128 v[72:75], v198 offset:4096 ds_read_b128 v[76:79], v198 offset:6144 ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_b128 v[80:83], v198 offset:8192 ds_read_b128 v[84:87], v198 offset:10240 ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_b128 v[88:91], v198 offset:12288 ds_read_b128 v[92:95], v198 offset:14336 ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND ds_read_b128 v[96:99], v198 offset:16384 v_mov_b32_e32 v63, 0 s_cmp_le_i32 s58, s69 v_mov_b32_e32 v62, v63 v_mov_b32_e32 v61, v63 v_mov_b32_e32 v60, v63 v_mov_b32_e32 v59, v63 v_mov_b32_e32 v58, v63 v_mov_b32_e32 v57, v63 v_mov_b32_e32 v56, v63 v_mov_b32_e32 v55, v63 v_mov_b32_e32 v54, v63 v_mov_b32_e32 v53, v63 v_mov_b32_e32 v52, v63 v_mov_b32_e32 v51, v63 v_mov_b32_e32 v50, v63 v_mov_b32_e32 v49, v63 v_mov_b32_e32 v48, v63 v_mov_b32_e32 v47, v63 v_mov_b32_e32 v46, v63 v_mov_b32_e32 v45, v63 v_mov_b32_e32 v44, v63 v_mov_b32_e32 v43, v63 v_mov_b32_e32 v42, v63 v_mov_b32_e32 v41, v63 v_mov_b32_e32 v40, v63 v_mov_b32_e32 v39, v63 v_mov_b32_e32 v38, v63 v_mov_b32_e32 v37, v63 v_mov_b32_e32 v36, v63 v_mov_b32_e32 v35, v63 v_mov_b32_e32 v34, v63 v_mov_b32_e32 v33, v63 v_mov_b32_e32 v32, v63 
v_mov_b32_e32 v31, v63 v_mov_b32_e32 v30, v63 v_mov_b32_e32 v29, v63 v_mov_b32_e32 v28, v63 v_mov_b32_e32 v27, v63 v_mov_b32_e32 v26, v63 v_mov_b32_e32 v25, v63 v_mov_b32_e32 v24, v63 v_mov_b32_e32 v23, v63 v_mov_b32_e32 v22, v63 v_mov_b32_e32 v21, v63 v_mov_b32_e32 v20, v63 v_mov_b32_e32 v19, v63 v_mov_b32_e32 v18, v63 v_mov_b32_e32 v17, v63 v_mov_b32_e32 v16, v63 v_mov_b32_e32 v15, v63 v_mov_b32_e32 v14, v63 v_mov_b32_e32 v13, v63 v_mov_b32_e32 v12, v63 v_mov_b32_e32 v11, v63 v_mov_b32_e32 v10, v63 v_mov_b32_e32 v9, v63 v_mov_b32_e32 v8, v63 v_mov_b32_e32 v7, v63 v_mov_b32_e32 v6, v63 v_mov_b32_e32 v5, v63 v_mov_b32_e32 v4, v63 v_mov_b32_e32 v3, v63 v_mov_b32_e32 v2, v63 v_mov_b32_e32 v1, v63 v_mov_b32_e32 v0, v63 v_mov_b32_e32 v227, v63 v_mov_b32_e32 v228, v63 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB10_23 ; %bb.8: ; %.lr.ph.i.preheader ; in Loop: Header=BB10_5 Depth=1 s_mul_i32 s8, s22, s31 s_mul_hi_u32 s9, s22, s30 s_add_i32 s8, s9, s8 s_mul_i32 s9, s23, s30 s_add_i32 s9, s8, s9 s_mul_i32 s8, s22, s30 s_lshl_b64 s[8:9], s[8:9], 2 s_add_u32 s41, s10, s8 s_addc_u32 s42, s11, s9 s_lshl_b32 s8, s58, 6 s_sub_i32 s8, s68, s8 s_ashr_i32 s59, s58, 31 s_add_i32 s70, s8, 64 s_lshl_b64 s[8:9], s[58:59], 2 s_add_u32 s8, s41, s8 s_addc_u32 s9, s42, s9 s_add_u32 s59, s8, -4 v_mov_b32_e32 v227, 0 s_addc_u32 s71, s9, -1 s_mov_b64 s[8:9], 0 s_mov_b32 s72, s58 v_mov_b32_e32 v228, 0 v_mov_b32_e32 v0, 0 v_mov_b32_e32 v1, v227 v_mov_b32_e32 v2, v227 v_mov_b32_e32 v3, v227 v_mov_b32_e32 v4, 0 v_mov_b32_e32 v5, v227 v_mov_b32_e32 v6, v227 v_mov_b32_e32 v7, v227 v_mov_b32_e32 v8, 0 v_mov_b32_e32 v9, v227 v_mov_b32_e32 v10, v227 v_mov_b32_e32 v11, v227 v_mov_b32_e32 v12, 0 v_mov_b32_e32 v13, v227 v_mov_b32_e32 v14, v227 v_mov_b32_e32 v15, v227 v_mov_b32_e32 v16, 0 v_mov_b32_e32 v17, v227 v_mov_b32_e32 v18, v227 v_mov_b32_e32 v19, v227 v_mov_b32_e32 v20, 0 v_mov_b32_e32 v21, v227 v_mov_b32_e32 v22, v227 v_mov_b32_e32 v23, v227 v_mov_b32_e32 v24, 0 
v_mov_b32_e32 v25, v227 v_mov_b32_e32 v26, v227 v_mov_b32_e32 v27, v227 v_mov_b32_e32 v28, 0 v_mov_b32_e32 v29, v227 v_mov_b32_e32 v30, v227 v_mov_b32_e32 v31, v227 v_mov_b32_e32 v32, 0 v_mov_b32_e32 v33, v227 v_mov_b32_e32 v34, v227 v_mov_b32_e32 v35, v227 v_mov_b32_e32 v36, 0 v_mov_b32_e32 v37, v227 v_mov_b32_e32 v38, v227 v_mov_b32_e32 v39, v227 v_mov_b32_e32 v40, 0 v_mov_b32_e32 v41, v227 v_mov_b32_e32 v42, v227 v_mov_b32_e32 v43, v227 v_mov_b32_e32 v44, 0 v_mov_b32_e32 v45, v227 v_mov_b32_e32 v46, v227 v_mov_b32_e32 v47, v227 v_mov_b32_e32 v48, 0 v_mov_b32_e32 v49, v227 v_mov_b32_e32 v50, v227 v_mov_b32_e32 v51, v227 v_mov_b32_e32 v52, 0 v_mov_b32_e32 v53, v227 v_mov_b32_e32 v54, v227 v_mov_b32_e32 v55, v227 v_mov_b32_e32 v56, 0 v_mov_b32_e32 v57, v227 v_mov_b32_e32 v58, v227 v_mov_b32_e32 v59, v227 v_mov_b32_e32 v60, 0 v_mov_b32_e32 v61, v227 v_mov_b32_e32 v62, v227 v_mov_b32_e32 v63, v227 ; implicit-def: $vgpr230 ; implicit-def: $vgpr229 .LBB10_9: ; %.lr.ph.i ; Parent Loop BB10_5 Depth=1 ; => This Inner Loop Header: Depth=2 s_add_u32 s42, s59, s8 s_addc_u32 s43, s71, s9 ;;#ASMSTART s_load_dword s41, s[42:43], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s42, s41, 31 s_mul_i32 s43, s41, s51 s_mul_hi_u32 s44, s41, s50 s_add_i32 s43, s44, s43 s_mul_i32 s42, s42, s50 s_add_i32 s43, s43, s42 v_readfirstlane_b32 s42, v202 v_lshl_or_b32 v100, s42, 4, v172 v_mul_lo_u32 v101, v100, s33 s_mul_i32 s41, s41, s50 s_add_u32 s44, s62, s41 s_addc_u32 s45, s63, s43 v_add_u32_e32 v102, v101, v173 v_cmp_gt_i32_e32 vcc, s70, v100 s_lshl_b32 s41, s42, 10 v_cndmask_b32_e32 v100, -1, v102, vcc s_add_i32 s41, s41, 0 ;;#ASMSTART s_mov_b32 m0, s41 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v174 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s42, s41, 0x1000 ;;#ASMSTART s_mov_b32 m0, s42 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v175 v_cndmask_b32_e32 v100, -1, v100, vcc 
s_add_i32 s42, s41, 0x2000 ;;#ASMSTART s_mov_b32 m0, s42 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v176 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s42, s41, 0x3000 ;;#ASMSTART s_mov_b32 m0, s42 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v177 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s42, s41, 0x4000 ;;#ASMSTART s_mov_b32 m0, s42 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v178 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s42, s41, 0x5000 ;;#ASMSTART s_mov_b32 m0, s42 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v179 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s42, s41, 0x6000 ;;#ASMSTART s_mov_b32 m0, s42 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v180 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s42, s41, 0x7000 ;;#ASMSTART s_mov_b32 m0, s42 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND v_add_u32_e32 v100, v101, v181 v_cndmask_b32_e32 v100, -1, v100, vcc s_add_i32 s41, s41, 0x8000 ;;#ASMSTART s_mov_b32 m0, s41 buffer_load_dwordx4 v100, s[44:47], s40 ,offen offset:0, lds ;;#ASMEND ;;#ASMSTART s_waitcnt vmcnt(8) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 ds_read_b128 v[104:107], v207 offset:2048 s_mov_b32 s42, s40 s_mov_b32 s43, s40 s_mov_b32 s41, s40 v_mov_b64_e32 v[110:111], s[42:43] v_mov_b64_e32 v[108:109], s[40:41] v_mov_b64_e32 v[114:115], v[110:111] v_mov_b64_e32 v[112:113], v[108:109] s_waitcnt lgkmcnt(1) s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[64:65], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[64:65], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[66:67], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[66:67], v[106:107], v[108:111] lit 
;;#ASMSTART s_waitcnt vmcnt(7) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:4096 ds_read_b128 v[104:107], v207 offset:6144 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[68:69], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[68:69], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[70:71], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[70:71], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(6) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:8192 ds_read_b128 v[104:107], v207 offset:10240 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[72:73], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[72:73], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[74:75], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[74:75], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(5) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:12288 ds_read_b128 v[104:107], v207 offset:14336 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[76:77], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[76:77], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[78:79], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[78:79], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:16384 ds_read_b128 v[104:107], v207 offset:18432 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[80:81], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[80:81], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[82:83], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[82:83], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND 
ds_read_b128 v[100:103], v207 offset:20480 ds_read_b128 v[104:107], v207 offset:22528 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[84:85], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[84:85], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[86:87], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[86:87], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:24576 ds_read_b128 v[104:107], v207 offset:26624 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[88:89], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[88:89], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[90:91], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[90:91], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:28672 ds_read_b128 v[104:107], v207 offset:30720 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[92:93], v[100:101], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[92:93], v[104:105], v[108:111] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[94:95], v[102:103], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[94:95], v[106:107], v[108:111] lit ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND ds_read_b128 v[100:103], v207 offset:32768 ds_read_b128 v[104:107], v207 offset:34816 v_mov_b32_e32 v116, 0xff800000 v_cmp_gt_i32_e32 vcc, s70, v182 s_cmp_lg_u32 s8, 0 s_waitcnt lgkmcnt(1) v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[96:97], v[100:101], v[112:115] lit v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[98:99], v[102:103], v[112:115] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[96:97], v[104:105], v[108:111] lit s_nop 0 v_cndmask_b32_e32 v236, v116, v112, vcc v_cmp_gt_i32_e32 vcc, s70, v186 
v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[98:99], v[106:107], v[108:111] lit v_cndmask_b32_e32 v235, v116, v113, vcc v_cmp_gt_i32_e32 vcc, s70, v187 v_cndmask_b32_e32 v233, v116, v114, vcc v_cmp_gt_i32_e32 vcc, s70, v188 v_mov_b64_e32 v[100:101], v[108:109] v_cndmask_b32_e32 v234, v116, v115, vcc v_mov_b64_e32 v[102:103], v[110:111] v_mov_b32_e32 v100, s67 v_cmp_gt_i32_e32 vcc, s70, v189 v_cndmask_b32_e32 v237, v100, v108, vcc v_cndmask_b32_e32 v100, v103, v111, vcc v_cndmask_b32_e32 v102, v102, v110, vcc v_cndmask_b32_e32 v101, v101, v109, vcc v_cmp_gt_i32_e32 vcc, s70, v190 v_cndmask_b32_e32 v238, v116, v101, vcc v_cmp_gt_i32_e32 vcc, s70, v191 v_cndmask_b32_e32 v239, v116, v102, vcc v_cmp_gt_i32_e32 vcc, s70, v192 v_cndmask_b32_e32 v240, v116, v100, vcc s_cbranch_scc0 .LBB10_15 ; %bb.10: ; in Loop: Header=BB10_9 Depth=2 v_and_b32_e32 v101, 63, v226 v_and_b32_e32 v103, 64, v226 v_max3_f32 v100, v228, v236, v235 v_xor_b32_e32 v102, 32, v101 v_add_u32_e32 v103, 64, v103 v_max3_f32 v100, v100, v233, v234 v_cmp_lt_i32_e32 vcc, v102, v103 v_max3_f32 v100, v100, v237, v238 v_cndmask_b32_e32 v102, v226, v102, vcc v_max3_f32 v100, v100, v239, v240 v_lshlrev_b32_e32 v102, 2, v102 ds_bpermute_b32 v102, v102, v100 v_xor_b32_e32 v101, 16, v101 v_cmp_lt_i32_e32 vcc, v101, v103 v_cndmask_b32_e32 v101, v226, v101, vcc v_lshlrev_b32_e32 v101, 2, v101 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v102 ds_bpermute_b32 v101, v101, v100 s_and_saveexec_b64 s[42:43], s[4:5] s_cbranch_execz .LBB10_12 ; %bb.11: ; in Loop: Header=BB10_9 Depth=2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v101 ds_write_b32 v183, v100 .LBB10_12: ; in Loop: Header=BB10_9 Depth=2 s_or_b64 exec, exec, s[42:43] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[42:43], s[0:1] s_cbranch_execz .LBB10_14 ; %bb.13: ; in Loop: Header=BB10_9 Depth=2 ds_read_b64 v[100:101], v184 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v101 ds_write_b32 v185, v100 offset:256 .LBB10_14: ; 
%_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp4ILb0ELb0EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.i ; in Loop: Header=BB10_9 Depth=2 s_or_b64 exec, exec, s[42:43] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v249, v185 offset:256 s_waitcnt lgkmcnt(0) v_sub_f32_e32 v100, v228, v249 v_mul_f32_e32 v228, v249, v168 v_cmp_lg_f32_e32 vcc, s67, v249 v_cndmask_b32_e32 v228, 0, v228, vcc v_mul_f32_e32 v100, v100, v168 v_fma_f32 v231, v236, v168, -v228 v_exp_f32_e32 v247, v100 v_exp_f32_e32 v241, v231 v_fma_f32 v231, v235, v168, -v228 v_fma_f32 v232, v234, v168, -v228 v_exp_f32_e32 v242, v231 v_fma_f32 v231, v233, v168, -v228 v_exp_f32_e32 v244, v232 v_exp_f32_e32 v231, v231 v_fma_f32 v232, v237, v168, -v228 v_fma_f32 v227, v247, v227, v241 v_exp_f32_e32 v243, v232 v_fma_f32 v232, v238, v168, -v228 v_add_f32_e32 v227, v227, v242 v_exp_f32_e32 v245, v232 v_fma_f32 v232, v239, v168, -v228 v_fma_f32 v228, v240, v168, -v228 v_exp_f32_e32 v232, v232 v_add_f32_e32 v227, v227, v231 v_exp_f32_e32 v246, v228 v_add_f32_e32 v227, v227, v244 v_add_f32_e32 v227, v227, v243 v_add_f32_e32 v227, v227, v245 v_mov_b32_e32 v248, v247 v_add_f32_e32 v227, v227, v232 v_pk_mul_f32 v[100:101], v[247:248], v[60:61] v_pk_mul_f32 v[102:103], v[247:248], v[62:63] v_pk_mul_f32 v[104:105], v[247:248], v[56:57] v_pk_mul_f32 v[106:107], v[247:248], v[58:59] v_pk_mul_f32 v[108:109], v[247:248], v[52:53] v_pk_mul_f32 v[110:111], v[247:248], v[54:55] v_pk_mul_f32 v[112:113], v[247:248], v[48:49] v_pk_mul_f32 v[114:115], v[247:248], v[50:51] v_pk_mul_f32 v[116:117], v[247:248], v[44:45] v_pk_mul_f32 v[118:119], v[247:248], v[46:47] v_pk_mul_f32 v[120:121], v[247:248], v[40:41] v_pk_mul_f32 v[122:123], v[247:248], v[42:43] v_pk_mul_f32 v[124:125], v[247:248], v[36:37] v_pk_mul_f32 
v[126:127], v[247:248], v[38:39] v_pk_mul_f32 v[128:129], v[247:248], v[32:33] v_pk_mul_f32 v[130:131], v[247:248], v[34:35] v_pk_mul_f32 v[132:133], v[247:248], v[28:29] v_pk_mul_f32 v[134:135], v[247:248], v[30:31] v_pk_mul_f32 v[136:137], v[247:248], v[24:25] v_pk_mul_f32 v[138:139], v[247:248], v[26:27] v_pk_mul_f32 v[140:141], v[247:248], v[20:21] v_pk_mul_f32 v[142:143], v[247:248], v[22:23] v_pk_mul_f32 v[144:145], v[247:248], v[16:17] v_pk_mul_f32 v[146:147], v[247:248], v[18:19] v_pk_mul_f32 v[148:149], v[247:248], v[12:13] v_pk_mul_f32 v[150:151], v[247:248], v[14:15] v_pk_mul_f32 v[152:153], v[247:248], v[8:9] v_pk_mul_f32 v[154:155], v[247:248], v[10:11] v_pk_mul_f32 v[156:157], v[247:248], v[4:5] v_pk_mul_f32 v[158:159], v[247:248], v[6:7] v_pk_mul_f32 v[160:161], v[247:248], v[0:1] v_pk_mul_f32 v[162:163], v[247:248], v[2:3] v_add_f32_e32 v227, v227, v246 v_mov_b32_e32 v228, v249 s_branch .LBB10_21 .LBB10_15: ; in Loop: Header=BB10_9 Depth=2 ; implicit-def: $vgpr246 ; implicit-def: $vgpr232 ; implicit-def: $vgpr245 ; implicit-def: $vgpr243 ; implicit-def: $vgpr227 ; implicit-def: $vgpr228 ; implicit-def: $vgpr241 ; implicit-def: $vgpr242 ; implicit-def: $vgpr231 ; implicit-def: $vgpr244 ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103 ; implicit-def: $vgpr104_vgpr105_vgpr106_vgpr107 ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111 ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115 ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119 ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123 ; implicit-def: $vgpr124_vgpr125_vgpr126_vgpr127 ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 ; implicit-def: 
$vgpr160_vgpr161_vgpr162_vgpr163 s_cbranch_execz .LBB10_21 ; %bb.16: ; in Loop: Header=BB10_9 Depth=2 v_mbcnt_hi_u32_b32 v101, -1, v225 v_and_b32_e32 v102, 63, v101 v_and_b32_e32 v104, 64, v101 v_max_f32_e32 v100, v236, v235 v_xor_b32_e32 v103, 32, v102 v_add_u32_e32 v104, 64, v104 v_max3_f32 v100, v100, v233, v234 v_cmp_lt_i32_e32 vcc, v103, v104 v_max3_f32 v100, v100, v237, v238 v_cndmask_b32_e32 v103, v101, v103, vcc v_max3_f32 v100, v100, v239, v240 v_lshlrev_b32_e32 v103, 2, v103 ds_bpermute_b32 v103, v103, v100 v_xor_b32_e32 v102, 16, v102 v_cmp_lt_i32_e32 vcc, v102, v104 v_cndmask_b32_e32 v101, v101, v102, vcc v_lshlrev_b32_e32 v101, 2, v101 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v103 ds_bpermute_b32 v101, v101, v100 s_and_saveexec_b64 s[42:43], s[4:5] s_cbranch_execz .LBB10_18 ; %bb.17: ; in Loop: Header=BB10_9 Depth=2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v101 ds_write_b32 v183, v100 .LBB10_18: ; in Loop: Header=BB10_9 Depth=2 s_or_b64 exec, exec, s[42:43] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[42:43], s[0:1] s_cbranch_execz .LBB10_20 ; %bb.19: ; in Loop: Header=BB10_9 Depth=2 ds_read_b64 v[100:101], v184 s_waitcnt lgkmcnt(0) v_max_f32_e32 v100, v100, v101 ds_write_b32 v185, v100 offset:256 .LBB10_20: ; %_ZN5flash7SoftmaxILi1EE25softmax_rescale_o_fp8_tp4ILb1ELb0EN4cute6TensorINS3_13array_alignedIfLm8ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEENS9_ILi2EEEEEENS8_IJSB_NS9_ILi0EEESA_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.i ; in Loop: Header=BB10_9 Depth=2 s_or_b64 exec, exec, s[42:43] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v228, v185 offset:256 v_mov_b64_e32 v[106:107], v[58:59] v_mov_b64_e32 v[110:111], v[54:55] v_mov_b64_e32 v[114:115], v[50:51] v_mov_b64_e32 v[118:119], v[46:47] s_waitcnt lgkmcnt(0) v_mul_f32_e32 v100, v228, v168 v_cmp_lg_f32_e32 vcc, s67, v228 v_cndmask_b32_e32 v100, 0, v100, 
vcc v_fma_f32 v101, v236, v168, -v100 v_mov_b64_e32 v[122:123], v[42:43] v_exp_f32_e32 v241, v101 v_fma_f32 v101, v235, v168, -v100 v_mov_b64_e32 v[126:127], v[38:39] v_exp_f32_e32 v242, v101 v_fma_f32 v101, v233, v168, -v100 v_mov_b64_e32 v[130:131], v[34:35] v_exp_f32_e32 v231, v101 v_fma_f32 v101, v234, v168, -v100 v_mov_b64_e32 v[134:135], v[30:31] v_exp_f32_e32 v244, v101 v_fma_f32 v101, v237, v168, -v100 v_mov_b64_e32 v[138:139], v[26:27] v_exp_f32_e32 v243, v101 v_fma_f32 v101, v238, v168, -v100 v_mov_b64_e32 v[142:143], v[22:23] v_exp_f32_e32 v245, v101 v_fma_f32 v101, v239, v168, -v100 v_fma_f32 v100, v240, v168, -v100 v_exp_f32_e32 v232, v101 v_exp_f32_e32 v246, v100 v_add_f32_e32 v100, v242, v241 v_add_f32_e32 v100, v100, v231 v_add_f32_e32 v100, v100, v244 v_add_f32_e32 v100, v100, v243 v_add_f32_e32 v100, v100, v245 v_add_f32_e32 v100, v100, v232 v_add_f32_e32 v227, v100, v246 v_mov_b64_e32 v[102:103], v[62:63] v_mov_b64_e32 v[146:147], v[18:19] v_mov_b64_e32 v[150:151], v[14:15] v_mov_b64_e32 v[154:155], v[10:11] v_mov_b64_e32 v[158:159], v[6:7] v_mov_b64_e32 v[162:163], v[2:3] v_mov_b64_e32 v[100:101], v[60:61] v_mov_b64_e32 v[104:105], v[56:57] v_mov_b64_e32 v[108:109], v[52:53] v_mov_b64_e32 v[112:113], v[48:49] v_mov_b64_e32 v[116:117], v[44:45] v_mov_b64_e32 v[120:121], v[40:41] v_mov_b64_e32 v[124:125], v[36:37] v_mov_b64_e32 v[128:129], v[32:33] v_mov_b64_e32 v[132:133], v[28:29] v_mov_b64_e32 v[136:137], v[24:25] v_mov_b64_e32 v[140:141], v[20:21] v_mov_b64_e32 v[144:145], v[16:17] v_mov_b64_e32 v[148:149], v[12:13] v_mov_b64_e32 v[152:153], v[8:9] v_mov_b64_e32 v[156:157], v[4:5] v_mov_b64_e32 v[160:161], v[0:1] .LBB10_21: ; in Loop: Header=BB10_9 Depth=2 s_add_i32 s72, s72, -1 ; sched_barrier mask(0x00000000) v_cvt_pk_fp8_f32 v241, v241, v242, v230 v_cvt_pk_fp8_f32 v243, v243, v245, v229 v_cvt_pk_fp8_f32 v231, v231, v244, v241 op_sel:[0,0,0,1] v_cvt_pk_fp8_f32 v232, v232, v246, v243 op_sel:[0,0,0,1] ds_write2_b32 v214, v231, v232 offset1:2 
s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[0:3], v208 offset:40960 ; sched_barrier mask(0x00000000) ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[4:6:8:10], v171 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v193 ds_read_m64x16_b8_alt4 v[12:14:16:18], v201 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v203 v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[0:1], v[4:5], v[100:103] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[0:1], v[4:5], v[108:111] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[0:1], v[4:5], v[112:115] lit v_mov_b32_e32 v4, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v5, v13 v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[0:1], v[6:7], v[104:107] lit s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[100:103], v[2:3], v[4:5], v[100:103] lit v_mov_b32_e32 v4, v14 v_mov_b32_e32 v5, v15 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[104:107], v[2:3], v[4:5], v[104:107] lit v_mov_b32_e32 v4, v16 v_mov_b32_e32 v5, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[108:111], v[2:3], v[4:5], v[108:111] lit ds_read_m64x16_b8_alt4 v[4:6:8:10], v204 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v206 v_mov_b32_e32 v12, v18 v_mov_b32_e32 v13, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[112:115], v[2:3], v[12:13], v[112:115] lit ds_read_m64x16_b8_alt4 v[12:14:16:18], v215 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v216 v_mmac_f32_16x16x32_fp8_fp8 v[116:119], v[0:1], v[4:5], v[116:119] lit v_mov_b32_e32 v4, v6 v_mov_b32_e32 v5, v7 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[120:123], v[0:1], v[4:5], v[120:123] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[124:127], v[0:1], v[4:5], v[124:127] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[128:131], v[0:1], v[4:5], v[128:131] lit v_mov_b32_e32 v4, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v5, v13 s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[116:119], v[2:3], v[4:5], v[116:119] lit v_mov_b32_e32 v4, v14 v_mov_b32_e32 v5, v15 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[120:123], v[2:3], v[4:5], v[120:123] lit v_mov_b32_e32 v4, v16 v_mov_b32_e32 v5, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[124:127], v[2:3], v[4:5], v[124:127] lit ds_read_m64x16_b8_alt4 v[4:6:8:10], v217 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v218 v_mov_b32_e32 v12, v18 v_mov_b32_e32 v13, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[128:131], v[2:3], v[12:13], v[128:131] lit ds_read_m64x16_b8_alt4 v[12:14:16:18], v219 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v220 v_mmac_f32_16x16x32_fp8_fp8 v[132:135], v[0:1], v[4:5], v[132:135] lit v_mov_b32_e32 v4, v6 v_mov_b32_e32 v5, v7 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[136:139], v[0:1], v[4:5], v[136:139] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[140:143], v[0:1], v[4:5], v[140:143] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[144:147], v[0:1], v[4:5], v[144:147] lit v_mov_b32_e32 v4, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v5, v13 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[132:135], v[2:3], v[4:5], v[132:135] lit v_mov_b32_e32 v4, v14 v_mov_b32_e32 v5, v15 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[136:139], v[2:3], v[4:5], v[136:139] lit v_mov_b32_e32 v4, v16 v_mov_b32_e32 v5, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[140:143], v[2:3], v[4:5], v[140:143] lit ds_read_m64x16_b8_alt4 v[4:6:8:10], v221 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[5:7:9:11], v222 v_mov_b32_e32 v12, v18 v_mov_b32_e32 v13, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[144:147], v[2:3], v[12:13], v[144:147] lit ds_read_m64x16_b8_alt4 v[12:14:16:18], v223 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[13:15:17:19], v224 v_mmac_f32_16x16x32_fp8_fp8 v[148:151], v[0:1], v[4:5], v[148:151] lit v_mov_b32_e32 v4, v6 v_mov_b32_e32 v5, v7 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[152:155], v[0:1], v[4:5], 
v[152:155] lit v_mov_b32_e32 v4, v8 v_mov_b32_e32 v5, v9 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[156:159], v[0:1], v[4:5], v[156:159] lit v_mov_b32_e32 v4, v10 v_mov_b32_e32 v5, v11 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[160:163], v[0:1], v[4:5], v[160:163] lit v_mov_b32_e32 v0, v12 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v1, v13 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[148:151], v[2:3], v[0:1], v[148:151] lit v_mov_b32_e32 v0, v14 v_mov_b32_e32 v1, v15 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[152:155], v[2:3], v[0:1], v[152:155] lit v_mov_b32_e32 v0, v16 v_mov_b32_e32 v1, v17 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[156:159], v[2:3], v[0:1], v[156:159] lit v_mov_b32_e32 v0, v18 v_mov_b32_e32 v1, v19 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[160:163], v[2:3], v[0:1], v[160:163] lit ; sched_barrier mask(0x00000000) s_add_i32 s70, s70, 64 s_add_u32 s8, s8, -4 s_addc_u32 s9, s9, -1 v_mov_b32_e32 v0, v160 v_mov_b32_e32 v1, v161 v_mov_b32_e32 v2, v162 v_mov_b32_e32 v3, v163 v_mov_b32_e32 v4, v156 v_mov_b32_e32 v5, v157 v_mov_b32_e32 v6, v158 v_mov_b32_e32 v7, v159 v_mov_b32_e32 v8, v152 v_mov_b32_e32 v9, v153 v_mov_b32_e32 v10, v154 v_mov_b32_e32 v11, v155 v_mov_b32_e32 v12, v148 v_mov_b32_e32 v13, v149 v_mov_b32_e32 v14, v150 v_mov_b32_e32 v15, v151 v_mov_b32_e32 v16, v144 v_mov_b32_e32 v17, v145 v_mov_b32_e32 v18, v146 v_mov_b32_e32 v19, v147 v_mov_b32_e32 v20, v140 v_mov_b32_e32 v21, v141 v_mov_b32_e32 v22, v142 v_mov_b32_e32 v23, v143 v_mov_b32_e32 v24, v136 v_mov_b32_e32 v25, v137 v_mov_b32_e32 v26, v138 v_mov_b32_e32 v27, v139 v_mov_b32_e32 v28, v132 v_mov_b32_e32 v29, v133 v_mov_b32_e32 v30, v134 v_mov_b32_e32 v31, v135 v_mov_b32_e32 v32, v128 v_mov_b32_e32 v33, v129 v_mov_b32_e32 v34, v130 v_mov_b32_e32 v35, v131 v_mov_b32_e32 v36, v124 v_mov_b32_e32 v37, v125 v_mov_b32_e32 v38, v126 v_mov_b32_e32 v39, v127 v_mov_b32_e32 v40, v120 v_mov_b32_e32 v41, v121 v_mov_b32_e32 v42, v122 v_mov_b32_e32 v43, v123 v_mov_b32_e32 v44, v116 v_mov_b32_e32 v45, v117 v_mov_b32_e32 v46, v118 
v_mov_b32_e32 v47, v119 v_mov_b32_e32 v48, v112 v_mov_b32_e32 v49, v113 v_mov_b32_e32 v50, v114 v_mov_b32_e32 v51, v115 v_mov_b32_e32 v52, v108 v_mov_b32_e32 v53, v109 v_mov_b32_e32 v54, v110 v_mov_b32_e32 v55, v111 v_mov_b32_e32 v56, v104 v_mov_b32_e32 v57, v105 v_mov_b32_e32 v58, v106 v_mov_b32_e32 v59, v107 v_mov_b32_e32 v60, v100 v_mov_b32_e32 v61, v101 v_mov_b32_e32 v62, v102 s_cmp_le_i32 s72, s69 v_mov_b32_e32 v63, v103 s_cbranch_scc1 .LBB10_23 ; %bb.22: ; in Loop: Header=BB10_9 Depth=2 v_mov_b32_e32 v230, v231 v_mov_b32_e32 v229, v232 s_branch .LBB10_9 .LBB10_23: ; %Flow916 ; in Loop: Header=BB10_5 Depth=1 s_cmp_eq_u32 s69, 0 s_cselect_b64 s[8:9], -1, 0 s_add_i32 s41, s68, 63 s_ashr_i32 s42, s41, 31 s_lshr_b32 s42, s42, 26 s_add_i32 s41, s41, s42 s_ashr_i32 s41, s41, 6 s_cmp_eq_u32 s58, s41 s_cselect_b64 s[42:43], -1, 0 s_and_b64 s[8:9], s[8:9], s[42:43] s_andn2_b64 vcc, exec, s[8:9] s_mov_b64 s[8:9], -1 s_cbranch_vccnz .LBB10_25 ; %bb.24: ; %Flow910 ; in Loop: Header=BB10_5 Depth=1 s_and_b64 vcc, exec, s[8:9] s_cbranch_vccz .LBB10_4 s_branch .LBB10_34 .LBB10_25: ; in Loop: Header=BB10_5 Depth=1 s_add_u32 s8, s36, s56 s_addc_u32 s9, s37, s57 global_load_dword v66, v164, s[8:9] v_mbcnt_hi_u32_b32 v64, -1, v225 v_and_b32_e32 v65, 63, v64 v_and_b32_e32 v68, 64, v64 v_xor_b32_e32 v67, 32, v65 v_add_u32_e32 v68, 64, v68 v_cmp_lt_i32_e32 vcc, v67, v68 v_cndmask_b32_e32 v67, v64, v67, vcc v_lshlrev_b32_e32 v67, 2, v67 ds_bpermute_b32 v67, v67, v227 v_xor_b32_e32 v65, 16, v65 v_cmp_lt_i32_e32 vcc, v65, v68 v_cndmask_b32_e32 v64, v64, v65, vcc v_lshlrev_b32_e32 v65, 2, v64 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v67, v227 ds_bpermute_b32 v65, v65, v64 s_waitcnt vmcnt(0) v_readfirstlane_b32 s41, v66 s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB10_27 ; %bb.26: ; in Loop: Header=BB10_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v64, v65 ds_write_b32 v194, v64 .LBB10_27: ; in Loop: Header=BB10_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) 
s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB10_29 ; %bb.28: ; in Loop: Header=BB10_5 Depth=1 ds_read_b64 v[64:65], v195 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v65, v64 ds_write_b32 v205, v64 offset:256 .LBB10_29: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp4ILb0ELb1ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB10_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_and_b64 s[8:9], s[38:39], exec s_cselect_b32 s8, s61, 0 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v64, v205 offset:256 s_add_i32 s8, s41, s8 s_mul_i32 s8, s8, s29 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s38, s8, s6 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v64 s_and_saveexec_b64 s[42:43], s[54:55] s_cbranch_execz .LBB10_31 ; %bb.30: ; in Loop: Header=BB10_5 Depth=1 v_log_f32_e32 v65, v64 s_ashr_i32 s39, s38, 31 v_mov_b32_e32 v66, 0xff800000 s_lshl_b64 s[8:9], s[38:39], 2 v_mul_f32_e32 v65, 0x3f317218, v65 v_fmac_f32_e32 v65, v228, v167 v_cndmask_b32_e32 v67, v65, v66, vcc v_mov_b32_e32 v66, s9 v_add_co_u32_e64 v65, s[8:9], s8, v196 v_addc_co_u32_e64 v66, s[8:9], v197, v66, s[8:9] global_store_dword v[65:66], v67, off .LBB10_31: ; %.loopexit708.i ; in Loop: Header=BB10_5 Depth=1 s_or_b64 exec, exec, s[42:43] s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB10_33 ; %bb.32: ; %.preheader705.i ; in Loop: Header=BB10_5 Depth=1 v_rcp_f32_e32 v67, v64 s_mul_i32 s38, s38, s28 v_mov_b32_e32 v64, v60 v_mov_b32_e32 v65, v56 v_mul_f32_e32 v67, s34, v67 v_cndmask_b32_e64 v122, v67, 1.0, vcc v_mov_b32_e32 v123, v122 v_mov_b32_e32 v78, v45 v_mov_b32_e32 v79, v41 v_mov_b32_e32 v94, v29 v_mov_b32_e32 v95, v25 v_mov_b32_e32 v110, v13 v_mov_b32_e32 v111, v9 s_ashr_i32 s39, s38, 31 v_pk_mul_f32 v[124:125], v[122:123], v[64:65] v_mov_b32_e32 v64, v62 
v_mov_b32_e32 v65, v58 v_pk_mul_f32 v[80:81], v[122:123], v[78:79] v_mov_b32_e32 v78, v46 v_mov_b32_e32 v79, v42 v_pk_mul_f32 v[96:97], v[122:123], v[94:95] v_mov_b32_e32 v94, v30 v_mov_b32_e32 v95, v26 v_pk_mul_f32 v[112:113], v[122:123], v[110:111] v_mov_b32_e32 v110, v14 v_mov_b32_e32 v111, v10 s_lshl_b64 s[38:39], s[38:39], 2 v_pk_mul_f32 v[70:71], v[122:123], v[64:65] v_mov_b32_e32 v64, v63 v_mov_b32_e32 v65, v59 v_mov_b32_e32 v76, v55 v_mov_b32_e32 v77, v51 v_pk_mul_f32 v[84:85], v[122:123], v[78:79] v_mov_b32_e32 v78, v47 v_mov_b32_e32 v79, v43 v_pk_mul_f32 v[100:101], v[122:123], v[94:95] v_mov_b32_e32 v94, v31 v_mov_b32_e32 v95, v27 v_pk_mul_f32 v[116:117], v[122:123], v[110:111] v_mov_b32_e32 v110, v15 v_mov_b32_e32 v111, v11 s_add_u32 s38, s52, s38 v_mov_b32_e32 v66, v61 v_mov_b32_e32 v67, v57 v_pk_mul_f32 v[74:75], v[122:123], v[64:65] v_mov_b32_e32 v64, v52 v_mov_b32_e32 v65, v48 v_mov_b32_e32 v68, v53 v_mov_b32_e32 v69, v49 v_mov_b32_e32 v72, v54 v_mov_b32_e32 v73, v50 v_pk_mul_f32 v[126:127], v[122:123], v[76:77] v_mov_b32_e32 v76, v44 v_mov_b32_e32 v77, v40 v_pk_mul_f32 v[88:89], v[122:123], v[78:79] v_mov_b32_e32 v78, v36 v_mov_b32_e32 v79, v32 v_mov_b32_e32 v82, v37 v_mov_b32_e32 v83, v33 v_mov_b32_e32 v86, v38 v_mov_b32_e32 v87, v34 v_mov_b32_e32 v90, v39 v_mov_b32_e32 v91, v35 v_mov_b32_e32 v92, v28 v_mov_b32_e32 v93, v24 v_pk_mul_f32 v[104:105], v[122:123], v[94:95] v_mov_b32_e32 v94, v20 v_mov_b32_e32 v95, v16 v_mov_b32_e32 v98, v21 v_mov_b32_e32 v99, v17 v_mov_b32_e32 v102, v22 v_mov_b32_e32 v103, v18 v_mov_b32_e32 v106, v23 v_mov_b32_e32 v107, v19 v_mov_b32_e32 v108, v12 v_mov_b32_e32 v109, v8 v_pk_mul_f32 v[120:121], v[122:123], v[110:111] v_mov_b32_e32 v110, v4 v_mov_b32_e32 v111, v0 v_mov_b32_e32 v114, v5 v_mov_b32_e32 v115, v1 v_mov_b32_e32 v118, v6 v_mov_b32_e32 v119, v2 v_mov_b32_e32 v128, v7 v_mov_b32_e32 v129, v3 s_addc_u32 s39, s53, s39 v_pk_mul_f32 v[66:67], v[122:123], v[66:67] v_pk_mul_f32 v[64:65], v[122:123], v[64:65] 
v_pk_mul_f32 v[68:69], v[122:123], v[68:69] v_pk_mul_f32 v[72:73], v[122:123], v[72:73] v_pk_mul_f32 v[76:77], v[122:123], v[76:77] v_pk_mul_f32 v[78:79], v[122:123], v[78:79] v_pk_mul_f32 v[82:83], v[122:123], v[82:83] v_pk_mul_f32 v[86:87], v[122:123], v[86:87] v_pk_mul_f32 v[90:91], v[122:123], v[90:91] v_pk_mul_f32 v[92:93], v[122:123], v[92:93] v_pk_mul_f32 v[94:95], v[122:123], v[94:95] v_pk_mul_f32 v[98:99], v[122:123], v[98:99] v_pk_mul_f32 v[102:103], v[122:123], v[102:103] v_pk_mul_f32 v[106:107], v[122:123], v[106:107] v_pk_mul_f32 v[108:109], v[122:123], v[108:109] v_pk_mul_f32 v[110:111], v[122:123], v[110:111] v_pk_mul_f32 v[114:115], v[122:123], v[114:115] v_pk_mul_f32 v[118:119], v[122:123], v[118:119] v_pk_mul_f32 v[122:123], v[122:123], v[128:129] global_store_dwordx2 v209, v[124:125], s[38:39] global_store_dwordx4 v213, v[64:67], s[38:39] offset:8 global_store_dwordx4 v213, v[68:71], s[38:39] offset:24 global_store_dwordx4 v213, v[72:75], s[38:39] offset:40 global_store_dwordx2 v213, v[126:127], s[38:39] offset:56 global_store_dwordx4 v213, v[76:79], s[38:39] offset:512 global_store_dwordx4 v210, v[80:83], s[38:39] offset:16 global_store_dwordx4 v210, v[84:87], s[38:39] offset:32 global_store_dwordx4 v210, v[88:91], s[38:39] offset:48 global_store_dwordx4 v213, v[92:95], s[38:39] offset:1024 global_store_dwordx4 v211, v[96:99], s[38:39] offset:16 global_store_dwordx4 v211, v[100:103], s[38:39] offset:32 global_store_dwordx4 v211, v[104:107], s[38:39] offset:48 global_store_dwordx4 v213, v[108:111], s[38:39] offset:1536 global_store_dwordx4 v212, v[112:115], s[38:39] offset:16 global_store_dwordx4 v212, v[116:119], s[38:39] offset:32 global_store_dwordx4 v212, v[120:123], s[38:39] offset:48 .LBB10_33: ; %Flow ; in Loop: Header=BB10_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_mov_b64 s[8:9], 0 s_branch .LBB10_4 .LBB10_34: ; in Loop: Header=BB10_5 Depth=1 v_mbcnt_hi_u32_b32 v64, -1, v225 v_and_b32_e32 v65, 63, v64 v_and_b32_e32 v67, 64, v64 
v_xor_b32_e32 v66, 32, v65 v_add_u32_e32 v67, 64, v67 v_cmp_lt_i32_e32 vcc, v66, v67 v_cndmask_b32_e32 v66, v64, v66, vcc v_lshlrev_b32_e32 v66, 2, v66 ds_bpermute_b32 v66, v66, v227 v_xor_b32_e32 v65, 16, v65 v_cmp_lt_i32_e32 vcc, v65, v67 v_cndmask_b32_e32 v64, v64, v65, vcc v_lshlrev_b32_e32 v65, 2, v64 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v66, v227 ds_bpermute_b32 v65, v65, v64 s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB10_36 ; %bb.35: ; in Loop: Header=BB10_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v64, v65 ds_write_b32 v194, v64 .LBB10_36: ; in Loop: Header=BB10_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB10_38 ; %bb.37: ; in Loop: Header=BB10_5 Depth=1 ds_read_b64 v[64:65], v195 s_waitcnt lgkmcnt(0) v_add_f32_e32 v64, v65, v64 ds_write_b32 v205, v64 offset:256 .LBB10_38: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp4ILb0ELb0ELb1EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB10_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v64, v205 offset:256 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v64 s_and_saveexec_b64 s[38:39], s[54:55] s_cbranch_execz .LBB10_40 ; %bb.39: ; in Loop: Header=BB10_5 Depth=1 s_mul_i32 s8, s22, s29 v_log_f32_e32 v65, v64 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s8, s8, s6 s_ashr_i32 s9, s8, 31 v_mul_f32_e32 v65, 0x3f317218, v65 v_fmac_f32_e32 v65, v228, v167 v_mov_b32_e32 v66, 0x7f800000 s_lshl_b64 s[8:9], s[8:9], 2 v_cndmask_b32_e32 v67, v65, v66, vcc v_mov_b32_e32 v66, s9 v_add_co_u32_e64 v65, s[8:9], s8, v199 v_addc_co_u32_e64 v66, s[8:9], v200, v66, s[8:9] global_store_dword v[65:66], v67, off .LBB10_40: ; %.loopexit.i ; in Loop: Header=BB10_5 
Depth=1 s_or_b64 exec, exec, s[38:39] s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB10_3 ; %bb.41: ; %.preheader.i ; in Loop: Header=BB10_5 Depth=1 s_mul_i32 s38, s22, s25 s_mul_hi_u32 s39, s22, s24 s_add_i32 s38, s39, s38 s_mul_i32 s23, s23, s24 v_rcp_f32_e32 v64, v64 s_add_i32 s23, s38, s23 s_mul_i32 s38, s22, s24 s_add_u32 s38, s65, s38 s_addc_u32 s39, s66, s23 s_lshl_b64 s[38:39], s[38:39], 1 v_mul_f32_e32 v64, s34, v64 s_add_u32 s23, s20, s38 v_cndmask_b32_e64 v64, v64, 1.0, vcc s_addc_u32 s38, s21, s39 v_mul_f32_e32 v60, v64, v60 v_mul_f32_e32 v61, v64, v61 v_mul_f32_e32 v56, v64, v56 v_mul_f32_e32 v57, v64, v57 v_mul_f32_e32 v52, v64, v52 v_mul_f32_e32 v53, v64, v53 v_mul_f32_e32 v48, v64, v48 v_mul_f32_e32 v49, v64, v49 v_mul_f32_e32 v65, v64, v16 v_mul_f32_e32 v81, v64, v2 v_mov_b32_e32 v2, s38 v_add_co_u32_e32 v16, vcc, s23, v165 v_mul_f32_e32 v62, v64, v62 v_mul_f32_e32 v63, v64, v63 v_mul_f32_e32 v58, v64, v58 v_mul_f32_e32 v59, v64, v59 v_mul_f32_e32 v54, v64, v54 v_mul_f32_e32 v55, v64, v55 v_mul_f32_e32 v50, v64, v50 v_mul_f32_e32 v51, v64, v51 v_mul_f32_e32 v44, v64, v44 v_mul_f32_e32 v45, v64, v45 v_mul_f32_e32 v46, v64, v46 v_mul_f32_e32 v47, v64, v47 v_mul_f32_e32 v40, v64, v40 v_mul_f32_e32 v41, v64, v41 v_mul_f32_e32 v42, v64, v42 v_mul_f32_e32 v43, v64, v43 v_mul_f32_e32 v36, v64, v36 v_mul_f32_e32 v37, v64, v37 v_mul_f32_e32 v38, v64, v38 v_mul_f32_e32 v39, v64, v39 v_mul_f32_e32 v32, v64, v32 v_mul_f32_e32 v33, v64, v33 v_mul_f32_e32 v34, v64, v34 v_mul_f32_e32 v35, v64, v35 v_mul_f32_e32 v28, v64, v28 v_mul_f32_e32 v29, v64, v29 v_mul_f32_e32 v30, v64, v30 v_mul_f32_e32 v31, v64, v31 v_mul_f32_e32 v24, v64, v24 v_mul_f32_e32 v25, v64, v25 v_mul_f32_e32 v26, v64, v26 v_mul_f32_e32 v27, v64, v27 v_mul_f32_e32 v20, v64, v20 v_mul_f32_e32 v21, v64, v21 v_mul_f32_e32 v22, v64, v22 v_mul_f32_e32 v23, v64, v23 v_mul_f32_e32 v66, v64, v17 v_mul_f32_e32 v18, v64, v18 v_mul_f32_e32 v19, v64, v19 v_mul_f32_e32 v67, v64, v12 v_mul_f32_e32 v68, 
v64, v13 v_mul_f32_e32 v69, v64, v14 v_mul_f32_e32 v70, v64, v15 v_mul_f32_e32 v71, v64, v8 v_mul_f32_e32 v72, v64, v9 v_mul_f32_e32 v73, v64, v10 v_mul_f32_e32 v74, v64, v11 v_mul_f32_e32 v75, v64, v4 v_mul_f32_e32 v76, v64, v5 v_mul_f32_e32 v77, v64, v6 v_mul_f32_e32 v78, v64, v7 v_mul_f32_e32 v79, v64, v0 v_mul_f32_e32 v80, v64, v1 v_mul_f32_e32 v64, v64, v3 v_cvt_pk_bf16_f32 v0, v60, v56 v_cvt_pk_bf16_f32 v1, v52, v48 v_addc_co_u32_e32 v17, vcc, v2, v166, vcc v_cvt_pk_bf16_f32 v2, v61, v57 v_cvt_pk_bf16_f32 v3, v53, v49 v_cvt_pk_bf16_f32 v4, v62, v58 v_cvt_pk_bf16_f32 v5, v54, v50 v_cvt_pk_bf16_f32 v6, v63, v59 v_cvt_pk_bf16_f32 v7, v55, v51 v_cvt_pk_bf16_f32 v8, v44, v40 v_cvt_pk_bf16_f32 v9, v36, v32 v_cvt_pk_bf16_f32 v10, v45, v41 v_cvt_pk_bf16_f32 v11, v37, v33 v_cvt_pk_bf16_f32 v12, v46, v42 v_cvt_pk_bf16_f32 v13, v38, v34 v_cvt_pk_bf16_f32 v14, v47, v43 v_cvt_pk_bf16_f32 v15, v39, v35 global_store_dwordx4 v[16:17], v[0:3], off global_store_dwordx4 v[16:17], v[4:7], off offset:16 global_store_dwordx4 v[16:17], v[8:11], off offset:256 global_store_dwordx4 v[16:17], v[12:15], off offset:272 v_cvt_pk_bf16_f32 v0, v28, v24 v_cvt_pk_bf16_f32 v1, v20, v65 v_cvt_pk_bf16_f32 v2, v29, v25 v_cvt_pk_bf16_f32 v3, v21, v66 v_cvt_pk_bf16_f32 v4, v30, v26 v_cvt_pk_bf16_f32 v5, v22, v18 v_cvt_pk_bf16_f32 v6, v31, v27 v_cvt_pk_bf16_f32 v7, v23, v19 v_cvt_pk_bf16_f32 v8, v67, v71 v_cvt_pk_bf16_f32 v9, v75, v79 v_cvt_pk_bf16_f32 v10, v68, v72 v_cvt_pk_bf16_f32 v11, v76, v80 v_cvt_pk_bf16_f32 v12, v69, v73 v_cvt_pk_bf16_f32 v13, v77, v81 v_cvt_pk_bf16_f32 v14, v70, v74 v_cvt_pk_bf16_f32 v15, v78, v64 global_store_dwordx4 v[16:17], v[0:3], off offset:512 global_store_dwordx4 v[16:17], v[4:7], off offset:528 global_store_dwordx4 v[16:17], v[8:11], off offset:768 global_store_dwordx4 v[16:17], v[12:15], off offset:784 s_branch .LBB10_3 .LBB10_42: ; %.loopexit s_endpgm .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params
        ; The symbol above is the operand of the `.amdhsa_kernel` directive that
        ; ends the previous line. It is the mangled name of
        ;   flash::flash_fwd_splitkv_mla_kernel_fp8_tp4<
        ;       Flash_fwd_kernel_traits_mla_qkvfp8_TP4<576, 32, 64, 4,
        ;           cutlass::float_e4m3_t, cutlass::bfloat16_t, 512>,
        ;       false, flash::SharedStorageMLAFloat8_TP4<...>>(Flash_fwd_mla_params)
        ; i.e. the kernel whose body (.LBB10_* labels) precedes this descriptor.
        ;
        ; Everything up to `.end_amdhsa_kernel` is the AMDHSA kernel descriptor
        ; for that kernel. The values are emitted by the compiler and are
        ; mirrored by the "; Kernel info:" comment block that follows `.size`;
        ; they should not be edited by hand.

        ; --- Memory segment sizes -------------------------------------------
        ; Static LDS size is 0 ("LDSByteSize: 0 bytes/workgroup (compile time
        ; only)" in the kernel info). The kernel body still issues
        ; ds_read/ds_write, so LDS is presumably sized at launch time.
        ; NOTE(review): confirm the dynamic LDS size against the host launch code.
        .amdhsa_group_segment_fixed_size 0
        ; No scratch memory ("ScratchSize: 0" in the kernel info).
        .amdhsa_private_segment_fixed_size 0
        ; 336 (0x150) bytes of kernel arguments; presumably
        ; sizeof(Flash_fwd_mla_params), the single by-value parameter. TODO confirm.
        .amdhsa_kernarg_size 336

        ; --- User SGPRs preloaded at wave start ---------------------------------
        ; 6 user SGPRs in total ("USER_SGPR: 6" in the kernel info). Only the
        ; private segment buffer and the kernarg segment pointer are requested.
        .amdhsa_user_sgpr_count 6
        .amdhsa_user_sgpr_private_segment_buffer 1
        .amdhsa_user_sgpr_dispatch_ptr 0
        .amdhsa_user_sgpr_queue_ptr 0
        .amdhsa_user_sgpr_kernarg_segment_ptr 1
        .amdhsa_user_sgpr_dispatch_id 0
        .amdhsa_user_sgpr_flat_scratch_init 0
        .amdhsa_user_sgpr_private_segment_size 0
        .amdhsa_uses_dynamic_stack 0

        ; --- System SGPRs / VGPRs ---------------------------------------------
        ; Workgroup ids X, Y and Z are all enabled (TGID_X_EN / TGID_Y_EN /
        ; TGID_Z_EN: 1 in the kernel info). No scratch wave offset and no
        ; workgroup info are requested.
        .amdhsa_system_sgpr_private_segment_wavefront_offset 0
        .amdhsa_system_sgpr_workgroup_id_x 1
        .amdhsa_system_sgpr_workgroup_id_y 1
        .amdhsa_system_sgpr_workgroup_id_z 1
        .amdhsa_system_sgpr_workgroup_info 0
        ; Agrees with "TIDIG_COMP_CNT: 0" in the kernel info.
        .amdhsa_system_vgpr_workitem_id 0

        ; --- Register budget ----------------------------------------------------
        ; 250 VGPRs ("NumVgprs: 250", "Occupancy: 1" in the kernel info).
        .amdhsa_next_free_vgpr 250
        ; 73 here versus "NumSgprs: 77" in the kernel info; the difference of 4
        ; is presumably the extra SGPRs reserved for VCC and the XNACK mask.
        ; NOTE(review): confirm against the LLVM AMDGPU usage guide.
        .amdhsa_next_free_sgpr 73
        .amdhsa_reserve_flat_scratch 0
        .amdhsa_reserve_xnack_mask 1

        ; --- Floating-point mode --------------------------------------------------
        ; Round modes 0 and denorm modes 3; together these presumably encode
        ; the "FloatMode: 240" (0xF0) value reported in the kernel info.
        .amdhsa_float_round_mode_32 0
        .amdhsa_float_round_mode_16_64 0
        .amdhsa_float_denorm_mode_32 3
        .amdhsa_float_denorm_mode_16_64 3
        .amdhsa_dx10_clamp 1
        ; Matches "IeeeMode: 1" in the kernel info.
        .amdhsa_ieee_mode 1
        .amdhsa_fp16_overflow 0
        ; The next four fields match the COMPUTE_PGM_RSRC3_GFX938 entries in the
        ; kernel info (BF16_OVFL: 0, BF8_OVFL: 1, FP8_OVFL: 1, BF16_DENORM: 3).
        .amdhsa_bf16_overflow 0
        .amdhsa_bf8_overflow 1
        .amdhsa_fp8_overflow 1
        .amdhsa_bf16_denorm_mode 3
        .amdhsa_load_store_out_of_order 0
        .amdhsa_matrix_excp 0

        ; --- Exception enables: every one is 0 -----------------------------------
        .amdhsa_exception_fp_ieee_invalid_op 0
        .amdhsa_exception_fp_denorm_src 0
        .amdhsa_exception_fp_ieee_div_zero 0
        .amdhsa_exception_fp_ieee_overflow 0
        .amdhsa_exception_fp_ieee_underflow 0
        .amdhsa_exception_fp_ieee_inexact 0
        .amdhsa_exception_int_div_zero 0
        .end_amdhsa_kernel

        ; Switch back from .rodata (where the descriptor was emitted) to the
        ; kernel's own .text section so the end-of-function label lands
        ; directly after the kernel code.
        .section .text._ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr
        ; End-of-function marker. The `.size` directive below takes its operands
        ; (symbol, .Lfunc_end10 - symbol) from the start of the next line.
.Lfunc_end10:
        .size
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params, .Lfunc_end10-_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 8640 ; NumSgprs: 77 ; NumVgprs: 250 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 0 bytes/workgroup (compile time only) ; SGPRBlocks: 9 ; VGPRBlocks: 62 ; NumSGPRsForWavesPerEU: 77 ; NumVGPRsForWavesPerEU: 250 ; Occupancy: 1 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section .text._ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .globl 
_ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params,@function _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params: ; @_ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x110 s_load_dwordx2 s[34:35], s[4:5], 0x0 s_lshl_b32 s2, s8, 3 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[2:3], s[2:3], 2 s_waitcnt lgkmcnt(0) s_add_u32 s24, s0, s2 s_addc_u32 s25, s1, s3 s_load_dwordx4 s[12:15], s[24:25], 0x0 s_waitcnt lgkmcnt(0) s_cmp_ge_i32 s12, s34 s_cbranch_scc1 .LBB11_51 ; %bb.1: s_cmp_gt_i32 s12, s14 s_cbranch_scc1 .LBB11_51 ; %bb.2: ; %.lr.ph s_load_dwordx4 s[16:19], s[4:5], 0x58 s_load_dwordx4 s[0:3], s[4:5], 0x140 s_load_dwordx2 s[52:53], s[4:5], 0xb0 s_load_dwordx4 s[36:39], s[4:5], 0xc0 s_load_dwordx2 s[28:29], s[4:5], 0xe0 s_load_dwordx4 s[8:11], s[4:5], 0xf0 s_load_dwordx2 s[54:55], s[4:5], 0x130 s_waitcnt lgkmcnt(0) s_load_dword s22, s[0:1], 0x0 s_load_dword s33, s[2:3], 0x0 s_load_dwordx4 s[40:43], s[4:5], 0xc s_load_dwordx2 s[20:21], s[4:5], 0x20 s_load_dword s34, s[24:25], 0x10 s_ashr_i32 s0, s13, 31 s_load_dwordx4 s[24:27], s[4:5], 0x90 s_load_dwordx2 s[60:61], s[4:5], 0x100 s_lshr_b32 s0, s0, 26 s_lshl_b32 s6, s6, 4 s_add_i32 s0, s13, s0 s_ashr_i32 s2, s6, 31 s_ashr_i32 s3, s7, 31 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v1, s22 s_ashr_i32 
s13, s0, 6 s_mul_i32 s0, s26, s2 s_mul_hi_u32 s1, s26, s6 s_mul_hi_u32 s22, s38, s7 s_mul_i32 s23, s38, s3 s_add_i32 s0, s1, s0 s_mul_i32 s1, s27, s6 s_add_i32 s22, s22, s23 s_mul_i32 s23, s39, s7 s_add_i32 s0, s0, s1 s_mul_i32 s1, s26, s6 s_add_i32 s22, s22, s23 s_mul_i32 s23, s38, s7 s_add_u32 s64, s23, s1 s_addc_u32 s65, s22, s0 s_ashr_i32 s0, s42, 31 ; implicit-def: $vgpr158 : SGPR spill to VGPR lane v_mul_f32_e32 v1, s33, v1 v_writelane_b32 v158, s40, 0 s_add_i32 s1, s42, s0 s_xor_b32 s1, s1, s0 v_cvt_f32_u32_e32 v2, s1 v_mul_f32_e32 v106, s20, v1 v_mul_f32_e32 v107, s21, v1 s_sub_i32 s21, 0, s1 v_rcp_iflag_f32_e32 v1, v2 s_add_i32 s20, s7, s3 s_xor_b32 s20, s20, s3 s_xor_b32 s0, s3, s0 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 v_and_b32_e32 v109, 15, v0 v_lshrrev_b32_e32 v3, 3, v0 v_add_lshl_u32 v3, v3, v0, 4 v_readfirstlane_b32 s22, v1 s_mul_i32 s21, s21, s22 s_mul_hi_u32 s21, s22, s21 s_add_i32 s22, s22, s21 s_mul_hi_u32 s21, s20, s22 s_mul_i32 s22, s21, s1 s_sub_i32 s20, s20, s22 s_add_i32 s22, s21, 1 s_sub_i32 s23, s20, s1 s_cmp_ge_u32 s20, s1 s_cselect_b32 s21, s22, s21 s_cselect_b32 s20, s23, s20 s_add_i32 s22, s21, 1 s_cmp_ge_u32 s20, s1 s_cselect_b32 s1, s22, s21 s_xor_b32 s1, s1, s0 s_sub_i32 s20, s1, s0 s_ashr_i32 s0, s20, 31 s_mul_i32 s0, s28, s0 s_mul_hi_u32 s1, s28, s20 s_add_i32 s21, s1, s0 s_load_dwordx2 s[0:1], s[4:5], 0x48 v_mul_lo_u32 v1, v109, s26 s_mul_i32 s22, s29, s20 s_add_i32 s21, s21, s22 s_mul_i32 s20, s28, s20 s_load_dwordx4 s[28:31], s[4:5], 0x28 v_lshrrev_b32_e32 v108, 6, v0 s_waitcnt lgkmcnt(0) s_add_u32 s67, s0, s20 v_add_u32_e32 v111, 0x200, v1 v_lshlrev_b32_e32 v1, 4, v0 v_and_b32_e32 v3, 48, v3 s_movk_i32 s20, 0x3c0 v_lshlrev_b32_e32 v4, 4, v109 s_addc_u32 s68, s1, s21 v_lshrrev_b32_e32 v2, 4, v0 v_and_or_b32 v113, v1, s20, v3 v_lshl_or_b32 v5, v108, 2, v4 s_add_i32 s20, 0, 0x7600 v_or_b32_e32 v2, v1, v2 v_and_b32_e32 v112, 0x3f0, v1 v_lshrrev_b32_e32 v1, 2, v0 v_add_u32_e32 v115, s20, v5 v_add_u32_e32 v116, 
s20, v4 s_add_i32 s20, 0, 0x7400 v_and_b32_e32 v1, 12, v1 v_add_u32_e32 v123, s20, v5 v_add_u32_e32 v124, s20, v4 s_mul_i32 s2, s36, s2 s_mul_hi_u32 s20, s36, s6 v_lshl_or_b32 v114, v108, 4, v1 v_mul_i32_i24_e32 v6, -3, v109 v_lshl_or_b32 v1, v108, 5, v1 s_add_i32 s2, s20, s2 s_mul_i32 s20, s37, s6 v_writelane_b32 v158, s41, 1 v_or_b32_e32 v3, s6, v109 v_lshlrev_b32_e32 v6, 2, v6 v_cmp_eq_u32_e32 vcc, 0, v1 v_mov_b32_e32 v1, 0 s_add_i32 s20, s2, s20 s_mul_hi_u32 s2, s8, s7 s_mul_i32 s3, s8, s3 v_writelane_b32 v158, s42, 2 v_add_u32_e32 v117, v116, v6 v_and_b32_e32 v118, 0x3fc, v2 v_xad_u32 v119, v3, -1, s35 v_add_u32_e32 v125, v124, v6 v_and_b32_e32 v2, 0xf0, v0 s_add_i32 s2, s2, s3 s_mul_i32 s3, s9, s7 v_mov_b32_e32 v3, v1 v_or_b32_e32 v5, 15, v0 v_mov_b32_e32 v6, v1 v_writelane_b32 v158, s43, 3 s_sub_i32 s0, 16, s35 s_add_i32 s9, s2, s3 v_mad_u64_u32 v[3:4], s[2:3], s36, v109, v[2:3] v_mul_lo_u32 v10, s37, v109 v_mad_u64_u32 v[5:6], s[2:3], s36, v109, v[5:6] s_load_dwordx4 s[40:43], s[4:5], 0x120 s_max_i32 s0, s0, 0 s_lshl_b32 s0, s0, 8 s_or_b32 s27, s0, 0x10000 s_sub_i32 s0, s35, s6 v_cmp_gt_i32_e64 s[0:1], s0, v109 v_add_u32_e32 v4, v10, v4 v_add_u32_e32 v6, v10, v6 v_lshlrev_b32_e32 v10, 2, v109 s_load_dwordx2 s[56:57], s[4:5], 0x80 s_and_b64 s[58:59], vcc, s[0:1] s_waitcnt lgkmcnt(0) v_mov_b32_e32 v11, s43 v_add_co_u32_e32 v126, vcc, s42, v10 s_load_dwordx2 s[42:43], s[4:5], 0x68 v_lshlrev_b32_e32 v7, 9, v109 s_mul_i32 s21, s36, s6 s_mul_i32 s8, s8, s7 v_addc_co_u32_e32 v127, vcc, 0, v11, vcc v_and_b32_e32 v110, 48, v0 v_or_b32_e32 v8, v7, v2 v_or_b32_e32 v9, 0x100, v2 s_add_u32 s69, s8, s21 v_mov_b32_e32 v11, s19 v_add_co_u32_e32 v128, vcc, s18, v10 v_lshlrev_b64 v[102:103], 1, v[3:4] v_lshlrev_b64 v[104:105], 1, v[5:6] v_mbcnt_lo_u32_b32 v133, -1, 0 s_mov_b32 s95, s24 s_mov_b32 s66, s25 s_mov_b32 s23, 0x10000 v_or_b32_e32 v101, 0x200, v110 s_mov_b32 s22, s52 v_or_b32_e32 v120, 1, v114 v_or_b32_e32 v121, 2, v114 v_or_b32_e32 v122, 3, v114 s_addc_u32 s70, s9, 
s20 v_addc_co_u32_e32 v129, vcc, 0, v11, vcc s_brev_b32 s46, 1 s_mov_b32 s47, 0x20000 s_add_i32 s71, 0, 0x4400 s_add_i32 s72, 0, 0x4800 s_add_i32 s73, 0, 0x4c00 s_mov_b32 s74, 0xff800000 s_add_i32 s75, 0, 0x400 v_lshlrev_b32_e32 v130, 2, v8 v_add_lshl_u32 v131, v7, v2, 2 v_add_lshl_u32 v132, v7, v9, 2 v_mbcnt_hi_u32_b32 v134, -1, v133 s_add_i32 s76, 0, 0x800 s_add_i32 s77, 0, 0xc00 v_cmp_eq_u32_e64 s[2:3], 0, v110 v_cmp_gt_u32_e64 s[4:5], 16, v0 s_mov_b32 s18, s12 s_mov_b32 s48, 0 s_branch .LBB11_5 .LBB11_3: ; %Flow256 ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[8:9] .LBB11_4: ; %_ZN5flash45compute_attn_1rowblock_splitkv_mla_fp8_gfx938I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEvRK20Flash_fwd_mla_paramsiiiiiiibRT1_fff.exit ; in Loop: Header=BB11_5 Depth=1 s_add_i32 s8, s18, 1 s_cmp_lt_i32 s18, s14 s_mov_b32 s18, s8 s_cbranch_scc0 .LBB11_51 .LBB11_5: ; =>This Loop Header: Depth=1 ; Child Loop BB11_28 Depth 2 s_ashr_i32 s19, s18, 31 s_lshl_b64 s[62:63], s[18:19], 2 s_add_u32 s8, s28, s62 s_addc_u32 s9, s29, s63 global_load_dword v0, v1, s[8:9] s_cmp_le_i32 s18, s12 s_waitcnt vmcnt(0) v_readfirstlane_b32 s78, v0 s_cbranch_scc1 .LBB11_7 ; %bb.6: ; in Loop: Header=BB11_5 Depth=1 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) .LBB11_7: ; in Loop: Header=BB11_5 Depth=1 s_cmp_eq_u32 s18, s12 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[20:21], s[8:9], exec s_cselect_b32 s80, s13, 0 s_cmp_eq_u32 s18, s14 s_cselect_b32 s20, s15, s78 s_add_i32 s20, s20, 63 s_ashr_i32 s21, s20, 31 s_lshr_b32 s21, s21, 26 s_add_i32 s20, s20, s21 s_ashr_i32 s79, s20, 6 s_waitcnt lgkmcnt(0) s_mul_i32 s20, s18, s43 s_mul_hi_u32 s21, s18, s42 s_add_i32 s20, s21, s20 s_mul_i32 s21, s19, s42 s_add_i32 s20, s20, s21 s_mul_i32 s21, s18, s42 s_add_u32 s21, s64, s21 s_addc_u32 s20, s65, s20 v_readfirstlane_b32 s36, v108 s_add_u32 s44, s30, s21 s_addc_u32 s45, s31, s20 s_lshl_b32 s20, s36, 6 
s_ashr_i32 s21, s20, 31 s_add_u32 s24, s44, s20 s_addc_u32 s25, s45, s21 s_lshl_b32 s21, s36, 10 s_add_i32 s81, s21, 0 s_cmp_lg_u32 s81, -1 s_cselect_b32 s82, s81, 0 s_bitset1_b32 s82, 31 s_nop 0 matrix_load_64x16_b8 s[24:27] s82 t r lds s_add_i32 s83, s82, 0x1000 s_nop 0 matrix_load_64x16_b8 s[24:27] s83 moffset:256 t r lds v_or_b32_e32 v0, s20, v110 v_cmp_gt_i32_e32 vcc, 64, v0 v_add_u32_e32 v2, v111, v0 s_and_b64 vcc, vcc, s[0:1] v_cndmask_b32_e32 v0, -1, v2, vcc s_add_i32 s20, s81, 0x2000 ;;#ASMSTART s_mov_b32 m0, s20 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND s_mov_b32 m0, 0 s_nop 0 ds_read_matrix_trans_format v[5:8], m0 element:1 row:3 col:1 ds_read_matrix_trans_format v[9:12], m0 offset:1024 element:1 row:3 col:1 ds_read_matrix_trans_format v[13:16], m0 offset:2048 element:1 row:3 col:1 ds_read_matrix_trans_format v[17:20], m0 offset:3072 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[21:24], m0 offset:4096 element:1 row:3 col:1 ds_read_matrix_trans_format v[25:28], m0 offset:5120 element:1 row:3 col:1 ds_read_matrix_trans_format v[29:32], m0 offset:6144 element:1 row:3 col:1 ds_read_matrix_trans_format v[33:36], m0 offset:7168 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND s_mul_i32 s20, s18, s61 s_mul_hi_u32 s21, s18, s60 v_add_u32_e32 v0, 0, v112 s_add_i32 s20, s21, s20 s_mul_i32 s21, s19, s60 ds_read_b128 v[37:40], v0 offset:8192 s_add_i32 s21, s20, s21 s_mul_i32 s20, s18, s60 s_lshl_b32 s86, s36, 4 s_lshl_b64 s[20:21], s[20:21], 2 s_add_u32 s85, s10, s20 s_addc_u32 s84, s11, s21 s_add_i32 s24, s79, -1 v_lshl_or_b32 v97, s36, 12, v113 s_cmp_le_i32 s79, s80 v_mov_b32_e32 v0, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB11_26 ; %bb.8: ; %.lr.ph.i ; in Loop: Header=BB11_5 Depth=1 s_ashr_i32 s25, s24, 31 s_ashr_i32 s36, s86, 31 s_lshl_b64 s[20:21], s[24:25], 2 
s_add_u32 s20, s85, s20 s_addc_u32 s21, s84, s21 ;;#ASMSTART s_load_dword s25, s[20:21], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s20, s25, 31 s_mul_i32 s21, s25, s57 s_mul_hi_u32 s37, s25, s56 s_add_i32 s21, s37, s21 s_mul_i32 s20, s20, s56 s_add_i32 s21, s21, s20 s_mul_i32 s25, s25, s56 s_add_u32 s44, s67, s25 s_addc_u32 s45, s68, s21 s_mul_i32 s20, s86, s53 s_mul_hi_u32 s21, s86, s52 s_add_i32 s25, s21, s20 s_mul_i32 s36, s36, s52 s_lshl_b32 s49, s24, 6 s_add_i32 s25, s25, s36 s_add_i32 s50, s86, s49 s_cmp_lt_i32 s50, s78 s_mov_b64 s[36:37], -1 s_cbranch_scc1 .LBB11_10 ; %bb.9: ; in Loop: Header=BB11_5 Depth=1 v_readfirstlane_b32 s20, v108 s_lshl_b32 s20, s20, 10 s_add_i32 s20, s20, 0 v_mov_b32_e32 v0, -1 s_nop 1 ;;#ASMSTART s_mov_b32 m0, s20 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s21, s20, 0x1000 ;;#ASMSTART s_mov_b32 m0, s21 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s21, s20, 0x2000 ;;#ASMSTART s_mov_b32 m0, s21 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s21, s20, 0x3000 ;;#ASMSTART s_mov_b32 m0, s21 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s21, s20, 0x4000 ;;#ASMSTART s_mov_b32 m0, s21 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s21, s20, 0x5000 ;;#ASMSTART s_mov_b32 m0, s21 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s21, s20, 0x6000 ;;#ASMSTART s_mov_b32 m0, s21 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_addk_i32 s20, 0x7000 ;;#ASMSTART s_mov_b32 m0, s20 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[36:37], 0 .LBB11_10: ; %Flow263 ; in Loop: Header=BB11_5 Depth=1 s_sub_i32 s94, 0, s78 s_add_i32 s89, s82, 0x2000 s_add_i32 s90, s82, 0x3000 s_add_i32 s91, s82, 0x4000 s_add_i32 s92, s82, 0x5000 s_add_i32 s87, s82, 0x6000 s_add_i32 s88, s82, 0x7000 s_andn2_b64 vcc, exec, 
s[36:37] s_mul_i32 s93, s86, s52 s_cbranch_vccnz .LBB11_12 ; %bb.11: ; in Loop: Header=BB11_5 Depth=1 s_add_i32 s20, s94, s50 s_add_i32 s20, s20, 16 s_max_i32 s20, s20, 0 s_add_u32 s36, s44, s93 s_addc_u32 s37, s45, s25 s_lshl_b32 s20, s20, 8 s_or_b32 s39, s20, 0x10000 s_mov_b32 s38, s22 s_nop 0 matrix_load_64x16_b8 s[36:39] s82 t r lds matrix_load_64x16_b8 s[36:39] s83 moffset:64 t r lds matrix_load_64x16_b8 s[36:39] s89 moffset:128 t r lds matrix_load_64x16_b8 s[36:39] s90 moffset:192 t r lds matrix_load_64x16_b8 s[36:39] s91 moffset:256 t r lds matrix_load_64x16_b8 s[36:39] s92 moffset:320 t r lds matrix_load_64x16_b8 s[36:39] s87 moffset:384 t r lds matrix_load_64x16_b8 s[36:39] s88 moffset:448 t r lds .LBB11_12: ; in Loop: Header=BB11_5 Depth=1 v_readfirstlane_b32 s20, v108 v_lshl_or_b32 v0, s20, 4, v109 v_mad_u64_u32 v[2:3], s[20:21], v0, s52, v[101:102] s_sub_i32 s20, s78, s49 v_cmp_gt_i32_e32 vcc, s20, v0 v_cndmask_b32_e32 v0, -1, v2, vcc buffer_load_dwordx4 v[49:52], v0, s[44:47], 0 offen s_add_i32 s20, 0, 0x4000 v_add_u32_e32 v76, s20, v97 v_add_u32_e32 v74, s71, v97 v_add_u32_e32 v82, s72, v97 v_add_u32_e32 v81, s73, v97 ;;#ASMSTART s_waitcnt vmcnt(8) s_barrier ;;#ASMEND s_mov_b32 m0, s81 s_nop 0 ds_read_matrix_trans_format v[41:44], m0 element:1 row:3 col:1 v_mov_b32_e32 v3, v1 v_mov_b32_e32 v4, v1 v_mov_b32_e32 v2, v1 v_mov_b64_e32 v[55:56], v[3:4] v_mov_b64_e32 v[53:54], v[1:2] s_waitcnt lgkmcnt(0) s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[5:6], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[7:8], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(7) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:4096 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[9:10], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[11:12], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(6) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:8192 element:1 row:3 col:1 
s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[13:14], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[15:16], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(5) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:12288 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[17:18], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[19:20], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:16384 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[21:22], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[23:24], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:20480 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[25:26], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[27:28], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:24576 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[29:30], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[31:32], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[65:68], m0 offset:28672 element:1 row:3 col:1 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[41:43:45:47], v76 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[42:44:46:48], v74 ds_read_m64x16_b8_alt4 v[58:60:62:64], v82 ds_read_m64x16_b8_alt4 v[71:73:75:77], v81 ; sched_barrier mask(0x00000000) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[33:34], v[65:66], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[35:36], v[67:68], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(0) ;;#ASMEND s_waitcnt vmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[37:38], v[49:50], v[53:56] lit 
v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[39:40], v[51:52], v[53:56] lit ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND v_readlane_b32 s36, v158, 0 v_readlane_b32 s39, v158, 3 s_ashr_i32 s20, s39, 31 s_add_i32 s21, s20, s39 s_xor_b32 s21, s21, s20 v_cvt_f32_u32_e32 v0, s21 v_ashrrev_i32_e32 v3, 31, v119 v_add_u32_e32 v4, v3, v119 v_xor_b32_e32 v4, v4, v3 v_rcp_iflag_f32_e32 v0, v0 s_not_b32 s36, s49 s_add_i32 s36, s36, s78 v_mov_b32_e32 v50, 0xff800000 v_mul_f32_e32 v0, 0x4f7ffffe, v0 v_cvt_u32_f32_e32 v0, v0 v_and_b32_e32 v51, 63, v134 v_xor_b32_e32 v52, 32, v51 v_xor_b32_e32 v51, 16, v51 v_mul_lo_u32 v2, s21, v0 v_readlane_b32 s37, v158, 1 v_readlane_b32 s38, v158, 2 v_sub_u32_e32 v2, 0, v2 v_mul_hi_u32 v2, v2, v0 v_add_u32_e32 v0, v2, v0 v_mul_hi_u32 v0, v0, v4 v_xor_b32_e32 v2, s20, v3 v_mul_lo_u32 v3, v0, s21 v_add_u32_e32 v49, 1, v0 v_sub_u32_e32 v3, v4, v3 v_cmp_le_u32_e32 vcc, s21, v3 v_subrev_u32_e32 v4, s21, v3 v_cndmask_b32_e32 v0, v0, v49, vcc v_cndmask_b32_e32 v3, v3, v4, vcc v_add_u32_e32 v4, 1, v0 v_cmp_le_u32_e32 vcc, s21, v3 v_cndmask_b32_e32 v0, v0, v4, vcc v_xor_b32_e32 v0, v0, v2 v_sub_u32_e32 v2, v0, v2 v_sub_u32_e32 v0, s36, v2 v_cmp_le_i32_e32 vcc, v114, v0 v_cndmask_b32_e32 v49, v50, v53, vcc v_cmp_le_i32_e32 vcc, v120, v0 v_cndmask_b32_e32 v4, v50, v54, vcc v_cmp_le_i32_e32 vcc, v121, v0 v_and_b32_e32 v53, 64, v134 v_cndmask_b32_e32 v3, v50, v55, vcc v_cmp_le_i32_e32 vcc, v122, v0 v_add_u32_e32 v53, 64, v53 v_cndmask_b32_e32 v0, v50, v56, vcc v_cmp_lt_i32_e32 vcc, v52, v53 v_max_f32_e32 v50, v49, v4 v_cndmask_b32_e32 v52, v134, v52, vcc v_max3_f32 v50, v50, v3, v0 v_lshlrev_b32_e32 v83, 2, v52 ds_bpermute_b32 v52, v83, v50 v_cmp_lt_i32_e32 vcc, v51, v53 v_cndmask_b32_e32 v51, v134, v51, vcc v_lshlrev_b32_e32 v84, 2, v51 s_waitcnt lgkmcnt(0) v_max_f32_e32 v50, v50, v52 ds_bpermute_b32 v51, v84, v50 s_and_saveexec_b64 s[36:37], s[2:3] s_cbranch_execz .LBB11_14 ; %bb.13: ; in Loop: Header=BB11_5 Depth=1 s_waitcnt lgkmcnt(0) v_max_f32_e32 
v50, v50, v51 ds_write_b32 v115, v50 .LBB11_14: ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[36:37] v_mov_b32_e32 v66, v64 v_mov_b32_e32 v57, v47 v_mov_b32_e32 v70, v48 v_mov_b32_e32 v65, v62 v_mov_b32_e32 v64, v60 v_mov_b32_e32 v63, v58 v_mov_b32_e32 v59, v71 v_mov_b32_e32 v56, v45 v_mov_b32_e32 v55, v43 v_mov_b32_e32 v54, v41 v_mov_b32_e32 v69, v46 v_mov_b32_e32 v68, v44 v_mov_b32_e32 v67, v42 v_mov_b32_e32 v60, v73 v_mov_b32_e32 v61, v75 v_mov_b32_e32 v62, v77 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[36:37], s[4:5] s_cbranch_execz .LBB11_16 ; %bb.15: ; in Loop: Header=BB11_5 Depth=1 ds_read_b128 v[41:44], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v43, v43, v44 v_max3_f32 v41, v41, v42, v43 ds_write_b32 v117, v41 offset:256 .LBB11_16: ; %_ZN5flash7SoftmaxILi1EE21softmax_rescale_o_fp8ILb1ELb1EN4cute6TensorINS3_13array_alignedIfLm4ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEESB_EEENS8_IJSB_NS9_ILi0EEESD_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.peel.i ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[36:37] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v135, v117 offset:256 v_add_u32_e32 v100, 0, v97 v_add_u32_e32 v136, s75, v97 v_add_u32_e32 v99, s76, v97 v_add_u32_e32 v98, s77, v97 s_waitcnt lgkmcnt(0) v_mul_f32_e32 v41, v135, v107 v_cmp_lg_f32_e32 vcc, s74, v135 v_cndmask_b32_e32 v41, 0, v41, vcc v_fma_f32 v42, v49, v107, -v41 v_fma_f32 v4, v4, v107, -v41 v_exp_f32_e32 v42, v42 v_exp_f32_e32 v4, v4 v_fma_f32 v3, v3, v107, -v41 v_fma_f32 v0, v0, v107, -v41 v_exp_f32_e32 v137, v3 v_exp_f32_e32 v0, v0 v_add_f32_e32 v3, v4, v42 v_cvt_pk_fp8_f32 v42, v42, v4, s0 v_add_f32_e32 v3, v3, v137 v_add_u32_e32 v4, 0, v118 v_cvt_pk_fp8_f32 v137, v137, v0, v42 op_sel:[0,0,0,1] v_add_f32_e32 v0, v3, v0 ds_write_b32 v4, v137 offset:28672 v_add_u32_e32 v4, 0, v112 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 
v[77:80], v4 offset:28672 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[85:87:89:91], v100 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[86:88:90:92], v136 ds_read_m64x16_b8_alt4 v[138:140:142:144], v99 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[139:141:143:145], v98 ; sched_barrier mask(0x00000000) s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[45:46], s[48:49] v_mov_b64_e32 v[47:48], s[50:51] v_mov_b64_e32 v[51:52], v[47:48] v_mov_b32_e32 v4, v67 v_mov_b64_e32 v[41:42], v[45:46] v_mov_b32_e32 v67, v55 v_mov_b64_e32 v[49:50], v[45:46] v_mov_b32_e32 v3, v54 v_mov_b64_e32 v[43:44], v[47:48] v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[77:78], v[67:68], v[49:52] lit v_mov_b32_e32 v68, v56 v_mov_b64_e32 v[55:56], v[47:48] v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[77:78], v[3:4], v[41:44] lit v_mov_b32_e32 v4, v59 v_mov_b32_e32 v59, v64 v_mov_b64_e32 v[53:54], v[45:46] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[79:80], v[59:60], v[49:52] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[77:78], v[68:69], v[53:56] lit v_mov_b32_e32 v60, v65 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[79:80], v[60:61], v[53:56] lit v_mov_b32_e32 v69, v57 v_mov_b64_e32 v[59:60], v[47:48] v_mov_b64_e32 v[57:58], v[45:46] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[77:78], v[69:70], v[57:60] lit v_mov_b32_e32 v61, v66 v_mov_b32_e32 v3, v63 s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[79:80], v[61:62], v[57:60] lit v_mov_b64_e32 v[63:64], v[47:48] v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[79:80], v[3:4], v[41:44] lit v_mov_b32_e32 v3, v85 v_mov_b32_e32 v4, v86 v_mov_b64_e32 v[61:62], v[45:46] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[77:78], v[3:4], v[61:64] lit v_mov_b32_e32 v3, v138 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v4, v139 v_mov_b64_e32 v[67:68], v[47:48] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[79:80], v[3:4], v[61:64] lit v_mov_b32_e32 v3, v87 v_mov_b32_e32 v4, v88 v_mov_b64_e32 v[65:66], v[45:46] s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[77:78], v[3:4], v[65:68] lit v_mov_b32_e32 v3, v140 v_mov_b32_e32 v4, v141 v_mov_b64_e32 v[71:72], v[47:48] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[79:80], v[3:4], v[65:68] lit v_mov_b32_e32 v3, v89 v_mov_b32_e32 v4, v90 v_mov_b64_e32 v[69:70], v[45:46] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[77:78], v[3:4], v[69:72] lit v_mov_b32_e32 v3, v142 v_mov_b32_e32 v4, v143 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[79:80], v[3:4], v[69:72] lit v_mov_b32_e32 v3, v91 v_mov_b32_e32 v4, v92 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[77:78], v[3:4], v[45:48] lit v_mov_b32_e32 v3, v144 v_mov_b32_e32 v4, v145 s_add_i32 s36, s79, -2 s_cmp_le_i32 s24, s80 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[79:80], v[3:4], v[45:48] lit s_cbranch_scc1 .LBB11_22 ; %bb.17: ; %.peel.next.i ; in Loop: Header=BB11_5 Depth=1 s_ashr_i32 s37, s36, 31 s_lshl_b64 s[20:21], s[36:37], 2 s_add_u32 s20, s85, s20 s_addc_u32 s21, s84, s21 s_lshl_b32 s24, s36, 6 s_add_i32 s36, s94, s24 s_add_i32 s36, s36, s86 s_add_i32 s36, s36, 16 s_max_i32 s36, s36, 0 s_lshl_b32 s36, s36, 8 s_sub_i32 s49, s78, s24 s_not_b32 s24, s24 v_sub_u32_e32 v146, 0, v2 s_or_b32 s39, s36, 0x10000 ;;#ASMSTART s_load_dword s36, s[20:21], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s20, s36, 31 s_mul_i32 s21, s36, s57 s_mul_hi_u32 s37, s36, s56 s_add_i32 s21, s37, s21 s_mul_i32 s20, s20, s56 s_add_i32 s21, s21, s20 s_mul_i32 s36, s36, s56 s_add_u32 s44, s67, s36 s_addc_u32 s45, s68, s21 s_add_u32 s36, s44, s93 s_addc_u32 s37, s45, s25 s_mov_b32 s38, s22 s_nop 0 matrix_load_64x16_b8 s[36:39] s82 t r lds matrix_load_64x16_b8 s[36:39] s83 moffset:64 t r lds v_readfirstlane_b32 s20, v108 matrix_load_64x16_b8 s[36:39] s89 moffset:128 t r lds v_lshl_or_b32 v4, s20, 4, v109 matrix_load_64x16_b8 s[36:39] s90 moffset:192 t r lds v_mad_u64_u32 v[2:3], s[20:21], v4, s52, v[101:102] matrix_load_64x16_b8 s[36:39] s91 moffset:256 t r lds matrix_load_64x16_b8 s[36:39] s92 moffset:320 t r 
lds matrix_load_64x16_b8 s[36:39] s87 moffset:384 t r lds v_cmp_gt_i32_e32 vcc, s49, v4 matrix_load_64x16_b8 s[36:39] s88 moffset:448 t r lds v_cndmask_b32_e32 v2, -1, v2, vcc buffer_load_dwordx4 v[85:88], v2, s[44:47], 0 offen ;;#ASMSTART s_waitcnt vmcnt(8) s_barrier ;;#ASMEND s_mov_b32 m0, s81 s_nop 0 ds_read_matrix_trans_format v[77:80], m0 element:1 row:3 col:1 v_mov_b32_e32 v4, v1 v_mov_b32_e32 v2, v1 v_mov_b32_e32 v3, v1 v_mov_b32_e32 v141, v4 v_mov_b32_e32 v140, v3 v_mov_b32_e32 v139, v2 v_mov_b32_e32 v138, v1 s_waitcnt lgkmcnt(0) s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[5:6], v[77:78], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[7:8], v[79:80], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(7) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:4096 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[9:10], v[77:78], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[11:12], v[79:80], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(6) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:8192 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[13:14], v[77:78], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[15:16], v[79:80], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(5) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:12288 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[17:18], v[77:78], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[19:20], v[79:80], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:16384 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[21:22], v[77:78], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[23:24], v[79:80], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:20480 
element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[25:26], v[77:78], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[27:28], v[79:80], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:24576 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[29:30], v[77:78], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[31:32], v[79:80], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[142:145], m0 offset:28672 element:1 row:3 col:1 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[73:75:77:79], v76 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[74:76:78:80], v74 ds_read_m64x16_b8_alt4 v[89:91:93:95], v82 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[90:92:94:96], v81 ; sched_barrier mask(0x00000000) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[33:34], v[142:143], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[35:36], v[144:145], v[138:141] lit ;;#ASMSTART s_waitcnt vmcnt(0) ;;#ASMEND s_waitcnt vmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[37:38], v[85:86], v[138:141] lit v_mmac_f32_16x16x32_fp8_fp8 v[138:141], v[39:40], v[87:88], v[138:141] lit ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_add_i32 s24, s24, s78 v_add_u32_e32 v81, s24, v146 v_mov_b32_e32 v82, 0xff800000 v_cmp_le_i32_e32 vcc, v114, v81 v_cndmask_b32_e32 v2, v82, v138, vcc v_cmp_le_i32_e32 vcc, v120, v81 v_cndmask_b32_e32 v3, v82, v139, vcc v_cmp_le_i32_e32 vcc, v121, v81 v_cndmask_b32_e32 v4, v82, v140, vcc v_cmp_le_i32_e32 vcc, v122, v81 v_cndmask_b32_e32 v138, v82, v141, vcc v_max3_f32 v81, v135, v2, v3 v_max3_f32 v81, v81, v4, v138 ds_bpermute_b32 v82, v83, v81 s_waitcnt lgkmcnt(0) v_max_f32_e32 v81, v81, v82 ds_bpermute_b32 v82, v84, v81 s_and_saveexec_b64 s[24:25], s[2:3] s_cbranch_execz .LBB11_19 ; %bb.18: ; in Loop: Header=BB11_5 Depth=1 s_waitcnt lgkmcnt(0) v_max_f32_e32 v81, v81, v82 
ds_write_b32 v115, v81 .LBB11_19: ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[24:25] v_mov_b32_e32 v88, v79 v_mov_b32_e32 v87, v77 v_mov_b32_e32 v86, v75 v_mov_b32_e32 v85, v73 v_mov_b32_e32 v80, v80 v_mov_b32_e32 v79, v78 v_mov_b32_e32 v78, v76 v_mov_b32_e32 v77, v74 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v81, v89 v_mov_b32_e32 v73, v90 v_mov_b32_e32 v82, v91 v_mov_b32_e32 v83, v93 v_mov_b32_e32 v84, v95 v_mov_b32_e32 v74, v92 v_mov_b32_e32 v75, v94 v_mov_b32_e32 v76, v96 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[24:25], s[4:5] s_cbranch_execz .LBB11_21 ; %bb.20: ; in Loop: Header=BB11_5 Depth=1 ds_read_b128 v[89:92], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v91, v91, v92 v_max3_f32 v89, v89, v90, v91 ds_write_b32 v117, v89 offset:256 .LBB11_21: ; %_ZN5flash7SoftmaxILi1EE21softmax_rescale_o_fp8ILb0ELb1EN4cute6TensorINS3_13array_alignedIfLm4ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEESB_EEENS8_IJSB_NS9_ILi0EEESD_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.i ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v95, v117 offset:256 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, s74, v95 v_cndmask_b32_e64 v89, v95, 0, vcc v_sub_f32_e32 v89, v135, v89 v_mul_f32_e32 v89, v89, v107 s_nop 0 v_exp_f32_e32 v93, v89 v_mul_f32_e32 v89, v95, v107 v_cndmask_b32_e64 v89, v89, 0, vcc v_fma_f32 v2, v2, v107, -v89 v_fma_f32 v3, v3, v107, -v89 v_exp_f32_e32 v2, v2 v_exp_f32_e32 v3, v3 v_fma_f32 v4, v4, v107, -v89 v_fma_f32 v89, v138, v107, -v89 v_exp_f32_e32 v4, v4 v_exp_f32_e32 v96, v89 v_add_f32_e32 v89, v3, v2 v_cvt_pk_fp8_f32 v2, v2, v3, v137 v_add_f32_e32 v3, v89, v4 v_mov_b32_e32 v94, v93 v_pk_mul_f32 v[61:62], v[93:94], v[61:62] v_cvt_pk_fp8_f32 v4, v4, v96, v2 op_sel:[0,0,0,1] v_add_u32_e32 v2, 0, v118 v_pk_mul_f32 v[63:64], v[93:94], v[63:64] v_pk_mul_f32 v[65:66], v[93:94], v[65:66] 
ds_write_b32 v2, v4 offset:28672 v_add_u32_e32 v2, 0, v112 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[89:92], v2 offset:28672 v_add_f32_e32 v4, v3, v96 v_pk_mul_f32 v[67:68], v[93:94], v[67:68] v_pk_mul_f32 v[69:70], v[93:94], v[69:70] v_pk_mul_f32 v[71:72], v[93:94], v[71:72] v_pk_mul_f32 v[45:46], v[93:94], v[45:46] v_pk_mul_f32 v[47:48], v[93:94], v[47:48] v_pk_mul_f32 v[41:42], v[93:94], v[41:42] v_pk_mul_f32 v[43:44], v[93:94], v[43:44] v_pk_mul_f32 v[49:50], v[93:94], v[49:50] v_pk_mul_f32 v[51:52], v[93:94], v[51:52] v_pk_mul_f32 v[53:54], v[93:94], v[53:54] v_pk_mul_f32 v[55:56], v[93:94], v[55:56] v_pk_mul_f32 v[57:58], v[93:94], v[57:58] v_pk_mul_f32 v[59:60], v[93:94], v[59:60] v_fmac_f32_e32 v4, v93, v0 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[135:137:139:141], v100 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[136:138:140:142], v136 ds_read_m64x16_b8_alt4 v[143:145:147:149], v99 ds_read_m64x16_b8_alt4 v[94:96:98:100], v98 ; sched_barrier mask(0x00000000) v_mov_b32_e32 v2, v85 v_mov_b32_e32 v3, v77 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[89:90], v[2:3], v[41:44] lit v_mov_b32_e32 v2, v81 v_mov_b32_e32 v3, v73 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[91:92], v[2:3], v[41:44] lit v_mov_b32_e32 v2, v135 s_waitcnt lgkmcnt(2) v_mov_b32_e32 v3, v136 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[89:90], v[2:3], v[61:64] lit s_waitcnt lgkmcnt(1) v_mov_b32_e32 v2, v143 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v3, v94 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[91:92], v[2:3], v[61:64] lit v_mov_b32_e32 v2, v137 v_mov_b32_e32 v3, v138 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[89:90], v[2:3], v[65:68] lit v_mov_b32_e32 v2, v145 v_mov_b32_e32 v3, v96 v_mov_b32_e32 v77, v86 s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[91:92], v[2:3], v[65:68] lit v_mov_b32_e32 v2, v139 v_mov_b32_e32 v3, v140 v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[89:90], v[77:78], v[49:52] lit v_mov_b32_e32 v73, v82 v_mov_b32_e32 v78, 
v87 v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[89:90], v[2:3], v[69:72] lit v_mov_b32_e32 v2, v147 v_mov_b32_e32 v3, v98 v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[91:92], v[73:74], v[49:52] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[89:90], v[78:79], v[53:56] lit v_mov_b32_e32 v74, v83 v_mov_b32_e32 v79, v88 v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[91:92], v[2:3], v[69:72] lit v_mov_b32_e32 v2, v141 v_mov_b32_e32 v3, v142 v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[91:92], v[74:75], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[89:90], v[79:80], v[57:60] lit v_mov_b32_e32 v75, v84 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[89:90], v[2:3], v[45:48] lit v_mov_b32_e32 v2, v149 v_mov_b32_e32 v3, v100 v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[91:92], v[75:76], v[57:60] lit s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[91:92], v[2:3], v[45:48] lit s_add_i32 s36, s79, -3 v_mov_b32_e32 v0, v4 v_mov_b32_e32 v135, v95 .LBB11_22: ; %Flow261 ; in Loop: Header=BB11_5 Depth=1 s_mov_b32 s24, s36 s_cmp_lt_i32 s24, s80 s_cbranch_scc0 .LBB11_27 .LBB11_23: ; in Loop: Header=BB11_5 Depth=1 v_mov_b32_e32 v3, v0 v_mov_b32_e32 v2, v135 .LBB11_24: ; %._crit_edge.i ; in Loop: Header=BB11_5 Depth=1 s_cmp_eq_u32 s80, 0 s_cselect_b64 s[20:21], -1, 0 s_add_i32 s24, s78, 63 s_ashr_i32 s25, s24, 31 s_lshr_b32 s25, s25, 26 s_add_i32 s24, s24, s25 s_ashr_i32 s24, s24, 6 s_cmp_eq_u32 s79, s24 s_cselect_b64 s[24:25], -1, 0 s_and_b64 s[20:21], s[20:21], s[24:25] s_andn2_b64 vcc, exec, s[20:21] s_mov_b64 s[24:25], -1 s_cbranch_vccnz .LBB11_34 ; %bb.25: ; %Flow257 ; in Loop: Header=BB11_5 Depth=1 s_and_b64 vcc, exec, s[24:25] s_cbranch_vccz .LBB11_4 s_branch .LBB11_43 .LBB11_26: ; in Loop: Header=BB11_5 Depth=1 s_mov_b32 s50, s48 s_mov_b32 s51, s48 s_mov_b32 s49, s48 v_mov_b64_e32 v[59:60], s[50:51] v_mov_b64_e32 v[57:58], s[48:49] v_mov_b64_e32 v[53:54], v[57:58] v_mov_b64_e32 v[49:50], v[57:58] v_mov_b64_e32 v[41:42], v[57:58] v_mov_b64_e32 v[45:46], v[57:58] v_mov_b64_e32 v[71:72], v[59:60] v_mov_b64_e32 
v[67:68], v[59:60] v_mov_b64_e32 v[63:64], v[59:60] v_mov_b32_e32 v135, 0 v_mov_b64_e32 v[55:56], v[59:60] v_mov_b64_e32 v[51:52], v[59:60] v_mov_b64_e32 v[43:44], v[59:60] v_mov_b64_e32 v[47:48], v[59:60] v_mov_b64_e32 v[69:70], v[57:58] v_mov_b64_e32 v[65:66], v[57:58] v_mov_b64_e32 v[61:62], v[57:58] s_cmp_lt_i32 s24, s80 s_cbranch_scc1 .LBB11_23 .LBB11_27: ; %.lr.ph634.i ; in Loop: Header=BB11_5 Depth=1 s_add_i32 s21, 0, 0x4000 s_ashr_i32 s20, s86, 31 v_add_u32_e32 v136, s21, v97 s_mul_i32 s21, s86, s53 s_mul_hi_u32 s25, s86, s52 s_add_i32 s21, s25, s21 s_mul_i32 s20, s20, s52 s_add_i32 s90, s21, s20 s_lshl_b32 s20, s24, 6 s_ashr_i32 s25, s24, 31 s_add_i32 s49, s82, 0x2000 s_add_i32 s50, s82, 0x3000 s_add_i32 s51, s82, 0x4000 s_add_i32 s87, s82, 0x5000 s_add_i32 s88, s82, 0x6000 s_add_i32 s89, s82, 0x7000 s_add_i32 s91, s24, 1 s_sub_i32 s92, s78, s20 s_lshl_b64 s[20:21], s[24:25], 2 s_add_u32 s24, s85, s20 v_add_u32_e32 v137, s71, v97 v_add_u32_e32 v138, s72, v97 v_add_u32_e32 v139, s73, v97 v_add_u32_e32 v140, 0, v97 v_add_u32_e32 v141, s75, v97 v_add_u32_e32 v142, s76, v97 v_add_u32_e32 v143, s77, v97 s_mul_i32 s86, s86, s52 s_addc_u32 s25, s84, s21 ; implicit-def: $vgpr144 .LBB11_28: ; Parent Loop BB11_5 Depth=1 ; => This Inner Loop Header: Depth=2 s_mov_b64 s[20:21], s[24:25] ;;#ASMSTART s_load_dword s36, s[20:21], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s20, s36, 31 s_mul_i32 s21, s36, s57 s_mul_hi_u32 s37, s36, s56 s_add_i32 s21, s37, s21 s_mul_i32 s20, s20, s56 s_add_i32 s21, s21, s20 s_mul_i32 s36, s36, s56 s_add_u32 s44, s67, s36 s_addc_u32 s45, s68, s21 s_add_u32 s20, s44, s86 s_addc_u32 s21, s45, s90 s_nop 0 matrix_load_64x16_b8 s[20:23] s82 t r lds matrix_load_64x16_b8 s[20:23] s83 moffset:64 t r lds v_readfirstlane_b32 s36, v108 matrix_load_64x16_b8 s[20:23] s49 moffset:128 t r lds v_lshl_or_b32 v4, s36, 4, v109 matrix_load_64x16_b8 s[20:23] s50 moffset:192 t r lds v_mad_u64_u32 v[2:3], s[36:37], v4, s52, v[101:102] matrix_load_64x16_b8 
s[20:23] s51 moffset:256 t r lds matrix_load_64x16_b8 s[20:23] s87 moffset:320 t r lds matrix_load_64x16_b8 s[20:23] s88 moffset:384 t r lds v_cmp_gt_i32_e32 vcc, s92, v4 matrix_load_64x16_b8 s[20:23] s89 moffset:448 t r lds v_cndmask_b32_e32 v2, -1, v2, vcc buffer_load_dwordx4 v[85:88], v2, s[44:47], 0 offen ;;#ASMSTART s_waitcnt vmcnt(8) s_barrier ;;#ASMEND s_mov_b32 m0, s81 s_nop 0 ds_read_matrix_trans_format v[77:80], m0 element:1 row:3 col:1 v_mov_b32_e32 v3, v1 v_mov_b32_e32 v4, v1 v_mov_b32_e32 v2, v1 v_mov_b64_e32 v[75:76], v[3:4] v_mov_b64_e32 v[73:74], v[1:2] s_waitcnt lgkmcnt(0) s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[5:6], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[7:8], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(7) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:4096 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[9:10], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[11:12], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(6) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:8192 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[13:14], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[15:16], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(5) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:12288 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[17:18], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[19:20], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:16384 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[21:22], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[23:24], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 
offset:20480 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[25:26], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[27:28], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:24576 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[29:30], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[31:32], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[89:92], m0 offset:28672 element:1 row:3 col:1 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[77:79:81:83], v136 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[78:80:82:84], v137 ds_read_m64x16_b8_alt4 v[93:95:97:99], v138 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[94:96:98:100], v139 ; sched_barrier mask(0x00000000) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[33:34], v[89:90], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[35:36], v[91:92], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(0) ;;#ASMEND s_waitcnt vmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[37:38], v[85:86], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[39:40], v[87:88], v[73:76] lit ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND v_mbcnt_hi_u32_b32 v3, -1, v133 v_and_b32_e32 v4, 63, v3 v_and_b32_e32 v86, 64, v3 v_xor_b32_e32 v85, 32, v4 v_add_u32_e32 v86, 64, v86 v_cmp_lt_i32_e32 vcc, v85, v86 v_max3_f32 v2, v135, v73, v74 v_cndmask_b32_e32 v85, v3, v85, vcc v_max3_f32 v2, v2, v75, v76 v_lshlrev_b32_e32 v85, 2, v85 ds_bpermute_b32 v85, v85, v2 v_xor_b32_e32 v4, 16, v4 v_cmp_lt_i32_e32 vcc, v4, v86 v_cndmask_b32_e32 v3, v3, v4, vcc v_lshlrev_b32_e32 v3, 2, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v85 ds_bpermute_b32 v3, v3, v2 s_and_saveexec_b64 s[36:37], s[2:3] s_cbranch_execz .LBB11_30 ; %bb.29: ; in Loop: Header=BB11_28 Depth=2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v3 ds_write_b32 v115, v2 .LBB11_30: ; in 
Loop: Header=BB11_28 Depth=2 s_or_b64 exec, exec, s[36:37] v_mov_b32_e32 v92, v83 v_mov_b32_e32 v91, v81 v_mov_b32_e32 v90, v79 v_mov_b32_e32 v89, v77 v_mov_b32_e32 v84, v84 v_mov_b32_e32 v83, v82 v_mov_b32_e32 v82, v80 v_mov_b32_e32 v81, v78 v_mov_b32_e32 v85, v93 v_mov_b32_e32 v77, v94 v_mov_b32_e32 v86, v95 v_mov_b32_e32 v87, v97 v_mov_b32_e32 v88, v99 v_mov_b32_e32 v78, v96 v_mov_b32_e32 v79, v98 v_mov_b32_e32 v80, v100 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[36:37], s[4:5] s_cbranch_execz .LBB11_32 ; %bb.31: ; in Loop: Header=BB11_28 Depth=2 ds_read_b128 v[93:96], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v95, v96 v_max3_f32 v2, v93, v94, v2 ds_write_b32 v117, v2 offset:256 .LBB11_32: ; %_ZN5flash7SoftmaxILi1EE21softmax_rescale_o_fp8ILb0ELb1EN4cute6TensorINS3_13array_alignedIfLm4ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEESB_EEENS8_IJSB_NS9_ILi0EEESD_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit1080.i ; in Loop: Header=BB11_28 Depth=2 s_or_b64 exec, exec, s[36:37] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v2, v117 offset:256 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, s74, v2 v_cndmask_b32_e64 v3, v2, 0, vcc v_sub_f32_e32 v3, v135, v3 v_mul_f32_e32 v3, v3, v107 s_nop 0 v_exp_f32_e32 v153, v3 v_mul_f32_e32 v3, v2, v107 v_cndmask_b32_e64 v3, v3, 0, vcc v_fma_f32 v4, v73, v107, -v3 v_mov_b32_e32 v154, v153 v_exp_f32_e32 v135, v4 v_fma_f32 v4, v74, v107, -v3 v_pk_mul_f32 v[61:62], v[153:154], v[61:62] v_exp_f32_e32 v155, v4 v_fma_f32 v4, v75, v107, -v3 v_fma_f32 v3, v76, v107, -v3 v_exp_f32_e32 v156, v4 v_exp_f32_e32 v157, v3 v_mov_b32_e32 v3, v135 v_cvt_pk_fp8_f32 v3, v3, v155, v144 v_mov_b32_e32 v144, v156 v_pk_mul_f32 v[63:64], v[153:154], v[63:64] v_pk_mul_f32 v[65:66], v[153:154], v[65:66] v_cvt_pk_fp8_f32 v144, v144, v157, v3 op_sel:[0,0,0,1] v_add_u32_e32 v3, 0, v118 v_pk_mul_f32 v[67:68], v[153:154], v[67:68] v_pk_mul_f32 
v[69:70], v[153:154], v[69:70] ds_write_b32 v3, v144 offset:28672 v_add_u32_e32 v3, 0, v112 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[73:76], v3 offset:28672 v_pk_mul_f32 v[71:72], v[153:154], v[71:72] v_pk_mul_f32 v[45:46], v[153:154], v[45:46] v_pk_mul_f32 v[47:48], v[153:154], v[47:48] v_pk_mul_f32 v[41:42], v[153:154], v[41:42] v_pk_mul_f32 v[43:44], v[153:154], v[43:44] v_pk_mul_f32 v[49:50], v[153:154], v[49:50] v_pk_mul_f32 v[51:52], v[153:154], v[51:52] v_pk_mul_f32 v[53:54], v[153:154], v[53:54] v_pk_mul_f32 v[55:56], v[153:154], v[55:56] v_pk_mul_f32 v[57:58], v[153:154], v[57:58] v_pk_mul_f32 v[59:60], v[153:154], v[59:60] ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[93:95:97:99], v140 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[94:96:98:100], v141 ds_read_m64x16_b8_alt4 v[145:147:149:151], v142 s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[146:148:150:152], v143 ; sched_barrier mask(0x00000000) v_mov_b32_e32 v3, v89 v_mov_b32_e32 v4, v81 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[73:74], v[3:4], v[41:44] lit v_mov_b32_e32 v3, v85 v_mov_b32_e32 v4, v77 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[75:76], v[3:4], v[41:44] lit v_mov_b32_e32 v3, v93 v_mov_b32_e32 v4, v94 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[73:74], v[3:4], v[61:64] lit v_mov_b32_e32 v3, v145 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v4, v146 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[75:76], v[3:4], v[61:64] lit v_mov_b32_e32 v3, v95 v_mov_b32_e32 v4, v96 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[73:74], v[3:4], v[65:68] lit v_mov_b32_e32 v3, v147 v_mov_b32_e32 v4, v148 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[75:76], v[3:4], v[65:68] lit v_mov_b32_e32 v3, v97 v_mov_b32_e32 v4, v98 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[73:74], v[3:4], v[69:72] lit v_mov_b32_e32 v3, v149 v_mov_b32_e32 v4, v150 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[75:76], v[3:4], v[69:72] lit v_mov_b32_e32 v3, v99 v_mov_b32_e32 v4, v100 
v_mov_b32_e32 v81, v90 s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[73:74], v[3:4], v[45:48] lit v_mov_b32_e32 v3, v151 v_mov_b32_e32 v4, v152 v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[73:74], v[81:82], v[49:52] lit v_mov_b32_e32 v77, v86 v_mov_b32_e32 v82, v91 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[75:76], v[3:4], v[45:48] lit v_add_f32_e32 v3, v155, v135 s_add_i32 s91, s91, -1 s_add_i32 s92, s92, 64 v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[75:76], v[77:78], v[49:52] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[73:74], v[82:83], v[53:56] lit v_mov_b32_e32 v78, v87 v_mov_b32_e32 v83, v92 v_add_f32_e32 v3, v3, v156 s_add_u32 s24, s24, -4 v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[75:76], v[78:79], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[73:74], v[83:84], v[57:60] lit v_mov_b32_e32 v79, v88 v_add_f32_e32 v3, v3, v157 s_addc_u32 s25, s25, -1 v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[75:76], v[79:80], v[57:60] lit s_cmp_gt_i32 s91, s80 v_fmac_f32_e32 v3, v153, v0 s_cbranch_scc0 .LBB11_24 ; %bb.33: ; in Loop: Header=BB11_28 Depth=2 v_mov_b32_e32 v0, v3 v_mov_b32_e32 v135, v2 s_branch .LBB11_28 .LBB11_34: ; in Loop: Header=BB11_5 Depth=1 s_add_u32 s20, s40, s62 s_addc_u32 s21, s41, s63 global_load_dword v5, v1, s[20:21] v_mbcnt_hi_u32_b32 v0, -1, v133 v_and_b32_e32 v4, 63, v0 v_and_b32_e32 v7, 64, v0 v_xor_b32_e32 v6, 32, v4 v_add_u32_e32 v7, 64, v7 v_cmp_lt_i32_e32 vcc, v6, v7 v_cndmask_b32_e32 v6, v0, v6, vcc v_lshlrev_b32_e32 v6, 2, v6 ds_bpermute_b32 v6, v6, v3 v_xor_b32_e32 v4, 16, v4 v_cmp_lt_i32_e32 vcc, v4, v7 v_cndmask_b32_e32 v0, v0, v4, vcc v_lshlrev_b32_e32 v4, 2, v0 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v6, v3 ds_bpermute_b32 v4, v4, v0 s_waitcnt vmcnt(0) v_readfirstlane_b32 s36, v5 s_and_saveexec_b64 s[24:25], s[2:3] s_cbranch_execz .LBB11_36 ; %bb.35: ; in Loop: Header=BB11_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v0, v4 ds_write_b32 v123, v0 .LBB11_36: ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_waitcnt lgkmcnt(0) 
s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[24:25], s[4:5] s_cbranch_execz .LBB11_38 ; %bb.37: ; in Loop: Header=BB11_5 Depth=1 ds_read_b128 v[4:7], v124 s_waitcnt lgkmcnt(0) v_pk_add_f32 v[4:5], v[6:7], v[4:5] v_add_f32_e32 v0, v5, v4 ds_write_b32 v125, v0 offset:256 .LBB11_38: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp1ILb0ELb1ELb0EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_and_b64 s[8:9], s[8:9], exec s_cselect_b32 s8, s34, 0 s_add_i32 s8, s36, s8 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v0, v125 offset:256 v_readlane_b32 s36, v158, 0 v_readlane_b32 s37, v158, 1 s_mul_i32 s8, s8, s37 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s24, s8, s6 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v0 v_readlane_b32 s38, v158, 2 v_readlane_b32 s39, v158, 3 s_and_saveexec_b64 s[36:37], s[58:59] s_cbranch_execz .LBB11_40 ; %bb.39: ; in Loop: Header=BB11_5 Depth=1 v_log_f32_e32 v4, v0 s_ashr_i32 s25, s24, 31 v_mov_b32_e32 v5, 0xff800000 s_lshl_b64 s[8:9], s[24:25], 2 v_mul_f32_e32 v4, 0x3f317218, v4 v_fmac_f32_e32 v4, v2, v106 v_cndmask_b32_e32 v6, v4, v5, vcc v_mov_b32_e32 v5, s9 v_add_co_u32_e64 v4, s[8:9], s8, v126 v_addc_co_u32_e64 v5, s[8:9], v127, v5, s[8:9] global_store_dword v[4:5], v6, off .LBB11_40: ; %.loopexit611.i ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[36:37] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB11_42 ; %bb.41: ; %.preheader608.i ; in Loop: Header=BB11_5 Depth=1 v_rcp_f32_e32 v0, v0 v_readlane_b32 s36, v158, 0 v_mov_b32_e32 v4, v61 v_mov_b32_e32 v5, v65 v_mul_f32_e32 v0, s33, v0 v_cndmask_b32_e64 v30, v0, 1.0, vcc v_mov_b32_e32 v31, v30 s_mul_i32 s20, s24, s36 v_pk_mul_f32 v[32:33], v[30:31], v[4:5] v_mov_b32_e32 v4, v63 v_mov_b32_e32 v5, v67 
v_mov_b32_e32 v18, v42 v_mov_b32_e32 v19, v50 s_ashr_i32 s21, s20, 31 v_pk_mul_f32 v[10:11], v[30:31], v[4:5] v_mov_b32_e32 v4, v64 v_mov_b32_e32 v5, v68 v_pk_mul_f32 v[20:21], v[30:31], v[18:19] v_mov_b32_e32 v18, v43 v_mov_b32_e32 v19, v51 s_lshl_b64 s[20:21], s[20:21], 2 v_pk_mul_f32 v[14:15], v[30:31], v[4:5] v_mov_b32_e32 v4, v69 v_mov_b32_e32 v5, v45 v_mov_b32_e32 v16, v72 v_mov_b32_e32 v17, v48 v_pk_mul_f32 v[24:25], v[30:31], v[18:19] v_mov_b32_e32 v18, v44 v_mov_b32_e32 v19, v52 s_add_u32 s20, s54, s20 v_mov_b32_e32 v6, v62 v_mov_b32_e32 v7, v66 v_pk_mul_f32 v[4:5], v[30:31], v[4:5] v_mov_b32_e32 v8, v70 v_mov_b32_e32 v9, v46 v_mov_b32_e32 v12, v71 v_mov_b32_e32 v13, v47 v_pk_mul_f32 v[34:35], v[30:31], v[16:17] v_mov_b32_e32 v16, v41 v_mov_b32_e32 v17, v49 v_pk_mul_f32 v[28:29], v[30:31], v[18:19] v_mov_b32_e32 v18, v53 v_mov_b32_e32 v19, v57 v_mov_b32_e32 v22, v54 v_mov_b32_e32 v23, v58 v_mov_b32_e32 v26, v55 v_mov_b32_e32 v27, v59 v_mov_b32_e32 v36, v56 v_mov_b32_e32 v37, v60 s_addc_u32 s21, s55, s21 v_pk_mul_f32 v[6:7], v[30:31], v[6:7] v_pk_mul_f32 v[8:9], v[30:31], v[8:9] v_pk_mul_f32 v[12:13], v[30:31], v[12:13] v_pk_mul_f32 v[16:17], v[30:31], v[16:17] v_pk_mul_f32 v[18:19], v[30:31], v[18:19] v_pk_mul_f32 v[22:23], v[30:31], v[22:23] v_pk_mul_f32 v[26:27], v[30:31], v[26:27] v_pk_mul_f32 v[30:31], v[30:31], v[36:37] v_readlane_b32 s37, v158, 1 v_readlane_b32 s38, v158, 2 v_readlane_b32 s39, v158, 3 global_store_dwordx2 v130, v[32:33], s[20:21] global_store_dwordx4 v131, v[4:7], s[20:21] offset:8 global_store_dwordx4 v131, v[8:11], s[20:21] offset:24 global_store_dwordx4 v131, v[12:15], s[20:21] offset:40 global_store_dwordx2 v131, v[34:35], s[20:21] offset:56 global_store_dwordx4 v131, v[16:19], s[20:21] offset:1024 global_store_dwordx4 v132, v[20:23], s[20:21] offset:16 global_store_dwordx4 v132, v[24:27], s[20:21] offset:32 global_store_dwordx4 v132, v[28:31], s[20:21] offset:48 .LBB11_42: ; %Flow ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, 
exec, s[8:9] s_mov_b64 s[24:25], 0 s_branch .LBB11_4 .LBB11_43: ; in Loop: Header=BB11_5 Depth=1 v_mbcnt_hi_u32_b32 v0, -1, v133 v_and_b32_e32 v4, 63, v0 v_and_b32_e32 v6, 64, v0 v_xor_b32_e32 v5, 32, v4 v_add_u32_e32 v6, 64, v6 v_cmp_lt_i32_e32 vcc, v5, v6 v_cndmask_b32_e32 v5, v0, v5, vcc v_lshlrev_b32_e32 v5, 2, v5 ds_bpermute_b32 v5, v5, v3 v_xor_b32_e32 v4, 16, v4 v_cmp_lt_i32_e32 vcc, v4, v6 v_cndmask_b32_e32 v0, v0, v4, vcc v_lshlrev_b32_e32 v4, 2, v0 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v5, v3 ds_bpermute_b32 v3, v4, v0 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB11_45 ; %bb.44: ; in Loop: Header=BB11_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v0, v3 ds_write_b32 v123, v0 .LBB11_45: ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB11_47 ; %bb.46: ; in Loop: Header=BB11_5 Depth=1 ds_read_b128 v[3:6], v124 s_waitcnt lgkmcnt(0) v_pk_add_f32 v[3:4], v[5:6], v[3:4] v_add_f32_e32 v0, v4, v3 ds_write_b32 v125, v0 offset:256 .LBB11_47: ; %_ZN5flash7SoftmaxILi1EE25normalize_softmax_lse_fp8ILb0ELb0EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT1_fff.exit.i ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v0, v125 offset:256 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v0 s_and_saveexec_b64 s[24:25], s[58:59] s_cbranch_execz .LBB11_49 ; %bb.48: ; in Loop: Header=BB11_5 Depth=1 v_readlane_b32 s36, v158, 0 v_readlane_b32 s37, v158, 1 s_mul_i32 s8, s18, s37 v_log_f32_e32 v3, v0 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s8, s8, s6 s_ashr_i32 s9, s8, 31 v_mul_f32_e32 v3, 0x3f317218, v3 v_fmac_f32_e32 v3, v2, v106 v_mov_b32_e32 v2, 0x7f800000 s_lshl_b64 s[8:9], s[8:9], 2 v_cndmask_b32_e32 v4, v3, 
v2, vcc v_mov_b32_e32 v3, s9 v_add_co_u32_e64 v2, s[8:9], s8, v128 v_addc_co_u32_e64 v3, s[8:9], v129, v3, s[8:9] v_readlane_b32 s38, v158, 2 v_readlane_b32 s39, v158, 3 global_store_dword v[2:3], v4, off .LBB11_49: ; %.loopexit.i ; in Loop: Header=BB11_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB11_3 ; %bb.50: ; %.preheader.i ; in Loop: Header=BB11_5 Depth=1 v_rcp_f32_e32 v0, v0 s_mul_i32 s20, s18, s66 s_mul_hi_u32 s21, s18, s95 s_add_i32 s20, s21, s20 s_mul_i32 s19, s19, s95 s_add_i32 s19, s20, s19 s_mul_i32 s20, s18, s95 v_mul_f32_e32 v0, s33, v0 s_add_u32 s20, s69, s20 v_cndmask_b32_e64 v0, v0, 1.0, vcc s_addc_u32 s21, s70, s19 v_mul_f32_e32 v2, v0, v61 v_mul_f32_e32 v4, v0, v62 v_mul_f32_e32 v3, v0, v65 v_mul_f32_e32 v5, v0, v66 v_mul_f32_e32 v10, v0, v69 v_mul_f32_e32 v11, v0, v70 v_mul_f32_e32 v14, v0, v45 v_mul_f32_e32 v15, v0, v46 s_lshl_b64 s[20:21], s[20:21], 1 v_mul_f32_e32 v6, v0, v63 v_mul_f32_e32 v8, v0, v64 v_mul_f32_e32 v7, v0, v67 v_mul_f32_e32 v9, v0, v68 v_mul_f32_e32 v12, v0, v71 v_mul_f32_e32 v16, v0, v47 s_add_u32 s19, s16, s20 v_cvt_pk_bf16_f32 v2, v2, v3 v_cvt_pk_bf16_f32 v3, v10, v14 v_cvt_pk_bf16_f32 v4, v4, v5 v_cvt_pk_bf16_f32 v5, v11, v15 v_mul_f32_e32 v13, v0, v72 v_mul_f32_e32 v17, v0, v48 s_addc_u32 s20, s17, s21 v_cvt_pk_bf16_f32 v6, v6, v7 v_cvt_pk_bf16_f32 v7, v12, v16 v_cvt_pk_bf16_f32 v8, v8, v9 v_mov_b32_e32 v14, s20 v_cvt_pk_bf16_f32 v15, v13, v17 v_add_co_u32_e32 v10, vcc, s19, v102 v_mul_f32_e32 v18, v0, v41 v_mul_f32_e32 v19, v0, v42 v_mul_f32_e32 v22, v0, v49 v_mul_f32_e32 v23, v0, v50 v_mul_f32_e32 v26, v0, v53 v_mul_f32_e32 v30, v0, v57 v_addc_co_u32_e32 v11, vcc, v14, v103, vcc v_mul_f32_e32 v27, v0, v54 v_mul_f32_e32 v31, v0, v58 global_store_dwordx4 v[10:11], v[2:5], off global_store_dwordx3 v[10:11], v[6:8], off offset:16 global_store_short v[10:11], v15, off offset:28 v_cvt_pk_bf16_f32 v2, v18, v22 v_cvt_pk_bf16_f32 v3, v26, v30 v_cvt_pk_bf16_f32 v4, v19, v23 
; -----------------------------------------------------------------------------
; NOTE: the physical line below holds many assembler statements run together.
; Its contents, in order:
;
;  1. The last instructions of an output-store block whose beginning lies
;     earlier in the file:
;       - eight v_mul_f32 that multiply v43/v44/v51/v52/v55/v56/v59/v60 by v0;
;       - v_cvt_pk_bf16_f32 packing pairs of f32 values into v5..v9;
;       - v[12:13] = s[19:20] + v[104:105] (64-bit add via vcc carry), used by
;         global_store_short_d16_hi to store the upper 16 bits of v15;
;       - two dwordx4 stores of v[2:5] and v[6:9] through v[10:11] at byte
;         offsets 512 and 528;
;       - s_branch back to .LBB11_3.
;  2. .LBB11_51 (%.loopexit): s_endpgm, the only instruction at this label.
;  3. The first part of this kernel's descriptor, emitted into .rodata with
;     .p2align 6.  Values set on this line:
;       - group segment and private segment fixed size: 0
;       - kernarg size: 336
;       - user SGPR count 6, with private_segment_buffer and
;         kernarg_segment_ptr enabled and every other user-SGPR field 0
;         (presumably 4 + 2 SGPRs; confirm against the LLVM AMDGPU usage guide)
;       - workgroup id x/y/z enabled; workgroup_info 0; vgpr_workitem_id 0
;       - next_free_vgpr 159, next_free_sgpr 96
;       - reserve_flat_scratch 0, reserve_xnack_mask 1
;       - float round modes 0, float denorm modes 3, dx10_clamp 1, ieee_mode 1
;       - fp16_overflow 0, bf16_overflow 0, bf8_overflow 1, fp8_overflow 1,
;         bf16_denorm_mode 3
;     NOTE(review): the bf16/bf8/fp8 overflow, bf16_denorm_mode, matrix_excp and
;     load_store_out_of_order fields are specific to this gfx938 toolchain;
;     their exact semantics are not visible here -- check the vendor docs.
; -----------------------------------------------------------------------------
v_mul_f32_e32 v20, v0, v43 v_mul_f32_e32 v21, v0, v44 v_mul_f32_e32 v24, v0, v51 v_mul_f32_e32 v25, v0, v52 v_mul_f32_e32 v28, v0, v55 v_mul_f32_e32 v29, v0, v56 v_mul_f32_e32 v32, v0, v59 v_mul_f32_e32 v0, v0, v60 v_cvt_pk_bf16_f32 v5, v27, v31 v_mov_b32_e32 v13, s20 v_cvt_pk_bf16_f32 v6, v20, v24 v_cvt_pk_bf16_f32 v7, v28, v32 v_cvt_pk_bf16_f32 v8, v21, v25 v_cvt_pk_bf16_f32 v9, v29, v0 v_add_co_u32_e32 v12, vcc, s19, v104 v_addc_co_u32_e32 v13, vcc, v13, v105, vcc global_store_short_d16_hi v[12:13], v15, off global_store_dwordx4 v[10:11], v[2:5], off offset:512 global_store_dwordx4 v[10:11], v[6:9], off offset:528 s_branch .LBB11_3 .LBB11_51: ; %.loopexit s_endpgm .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 0 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_sgpr_workgroup_id_z 1 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 159 .amdhsa_next_free_sgpr 96 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_bf16_overflow 0 .amdhsa_bf8_overflow 1 .amdhsa_fp8_overflow 1 .amdhsa_bf16_denorm_mode 3 
; -----------------------------------------------------------------------------
; The physical line below continues the same .amdhsa_kernel block.  In order:
;
;  1. The remaining descriptor fields, all 0: load_store_out_of_order,
;     matrix_excp and every .amdhsa_exception_* enable.  Then
;     .end_amdhsa_kernel.
;  2. A switch back to this kernel's own .text.<symbol> section, where
;     .Lfunc_end11 is defined and .size is set to
;     (.Lfunc_end11 - <kernel symbol>).
;  3. The .AMDGPU.csdata section, which here contains only comments with the
;     compiler's resource summary: codeLenInByte 9340, NumSgprs 100,
;     NumVgprs 159, ScratchSize 0, LDSByteSize 0 ("compile time only"),
;     SGPRBlocks 12, VGPRBlocks 39, Occupancy 1, and the COMPUTE_PGM_RSRC2 /
;     RSRC3 fields, which repeat the descriptor values.
;     NOTE(review): NumSgprs (100) exceeds next_free_sgpr (96) by 4; presumably
;     this is the VCC and XNACK_MASK reservation -- confirm.
;  4. A trailing `.section` directive whose operand (the section name of the
;     next kernel) is at the start of the following physical line.
; -----------------------------------------------------------------------------
.amdhsa_load_store_out_of_order 0 .amdhsa_matrix_excp 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section .text._ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end11: .size _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params, .Lfunc_end11-_ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 9340 ; NumSgprs: 100 ; NumVgprs: 159 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 0 bytes/workgroup (compile time only) ; SGPRBlocks: 12 ; VGPRBlocks: 39 ; NumSGPRsForWavesPerEU: 100 ; NumVGPRsForWavesPerEU: 159 ; Occupancy: 1 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .section 
.text._ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .protected _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params ; -- Begin function _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .globl _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .p2align 8 .type _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params,@function _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params: ; @_ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x110 s_load_dwordx2 s[34:35], s[4:5], 0x0 s_lshl_b32 s2, s8, 3 s_ashr_i32 s3, s2, 31 s_lshl_b64 s[2:3], s[2:3], 2 s_waitcnt lgkmcnt(0) s_add_u32 s24, s0, s2 s_addc_u32 s25, s1, s3 s_load_dwordx4 s[12:15], s[24:25], 0x0 s_waitcnt lgkmcnt(0) s_cmp_ge_i32 s12, s34 s_cbranch_scc1 .LBB12_45 ; %bb.1: s_cmp_gt_i32 s12, s14 s_cbranch_scc1 .LBB12_45 ; %bb.2: ; %.lr.ph s_load_dwordx4 
s[16:19], s[4:5], 0x58 s_load_dwordx4 s[0:3], s[4:5], 0x140 s_load_dwordx2 s[52:53], s[4:5], 0xb0 s_load_dwordx4 s[36:39], s[4:5], 0xc0 s_load_dwordx2 s[28:29], s[4:5], 0xe0 s_load_dwordx4 s[8:11], s[4:5], 0xf0 s_load_dwordx2 s[54:55], s[4:5], 0x130 s_waitcnt lgkmcnt(0) s_load_dword s26, s[0:1], 0x0 s_load_dword s33, s[2:3], 0x0 s_load_dwordx4 s[20:23], s[4:5], 0xc s_load_dwordx2 s[30:31], s[4:5], 0x20 s_load_dword s34, s[24:25], 0x10 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v1, s26 s_load_dwordx4 s[24:27], s[4:5], 0x90 s_load_dwordx2 s[58:59], s[4:5], 0x100 s_ashr_i32 s0, s13, 31 s_lshr_b32 s0, s0, 26 s_lshl_b32 s6, s6, 4 s_add_i32 s0, s13, s0 s_ashr_i32 s2, s6, 31 s_ashr_i32 s13, s0, 6 s_waitcnt lgkmcnt(0) s_mul_i32 s0, s26, s2 s_mul_hi_u32 s1, s26, s6 s_ashr_i32 s3, s7, 31 s_add_i32 s0, s1, s0 s_mul_i32 s1, s27, s6 s_mul_hi_u32 s23, s38, s7 s_mul_i32 s27, s38, s3 s_add_i32 s23, s23, s27 s_mul_i32 s27, s39, s7 s_add_i32 s0, s0, s1 s_mul_i32 s1, s26, s6 s_add_i32 s23, s23, s27 s_mul_i32 s27, s38, s7 s_add_u32 s62, s27, s1 s_addc_u32 s63, s23, s0 s_ashr_i32 s0, s22, 31 s_add_i32 s1, s22, s0 s_xor_b32 s1, s1, s0 v_cvt_f32_u32_e32 v2, s1 v_mul_f32_e32 v1, s33, v1 v_mul_f32_e32 v106, s30, v1 v_mul_f32_e32 v107, s31, v1 v_rcp_iflag_f32_e32 v1, v2 s_mov_b32 s64, s24 s_sub_i32 s23, 0, s1 s_add_i32 s22, s7, s3 v_mul_f32_e32 v1, 0x4f7ffffe, v1 v_cvt_u32_f32_e32 v1, v1 s_xor_b32 s22, s22, s3 s_mov_b32 s65, s25 s_xor_b32 s0, s3, s0 v_readfirstlane_b32 s24, v1 s_mul_i32 s23, s23, s24 s_mul_hi_u32 s23, s24, s23 s_add_i32 s24, s24, s23 s_mul_hi_u32 s23, s22, s24 s_mul_i32 s24, s23, s1 s_sub_i32 s22, s22, s24 s_add_i32 s24, s23, 1 s_sub_i32 s25, s22, s1 s_cmp_ge_u32 s22, s1 s_cselect_b32 s23, s24, s23 s_cselect_b32 s22, s25, s22 s_add_i32 s24, s23, 1 s_cmp_ge_u32 s22, s1 s_cselect_b32 s1, s24, s23 s_xor_b32 s1, s1, s0 s_sub_i32 s22, s1, s0 s_ashr_i32 s0, s22, 31 s_mul_i32 s0, s28, s0 s_mul_hi_u32 s1, s28, s22 s_add_i32 s23, s1, s0 s_load_dwordx2 s[0:1], s[4:5], 0x48 v_and_b32_e32 
v109, 15, v0 v_mul_lo_u32 v1, v109, s26 s_mul_i32 s24, s29, s22 v_lshrrev_b32_e32 v3, 3, v0 s_add_i32 s23, s23, s24 s_mul_i32 s22, s28, s22 s_load_dwordx4 s[28:31], s[4:5], 0x28 v_add_lshl_u32 v3, v3, v0, 4 s_waitcnt lgkmcnt(0) s_add_u32 s66, s0, s22 v_add_u32_e32 v111, 0x200, v1 v_lshlrev_b32_e32 v1, 4, v0 v_lshrrev_b32_e32 v2, 4, v0 v_and_b32_e32 v3, 48, v3 s_movk_i32 s22, 0x3c0 v_lshrrev_b32_e32 v108, 6, v0 v_or_b32_e32 v2, v1, v2 v_and_b32_e32 v112, 0x3f0, v1 v_and_or_b32 v113, v1, s22, v3 v_lshrrev_b32_e32 v1, 2, v0 v_lshlrev_b32_e32 v3, 4, v109 s_addc_u32 s67, s1, s23 v_and_b32_e32 v1, 12, v1 v_lshl_or_b32 v4, v108, 2, v3 s_add_i32 s22, 0, 0x7600 s_mul_i32 s2, s36, s2 s_mul_hi_u32 s24, s36, s6 v_lshl_or_b32 v114, v108, 4, v1 v_add_u32_e32 v115, s22, v4 v_add_u32_e32 v116, s22, v3 v_mul_i32_i24_e32 v5, -3, v109 s_add_i32 s22, 0, 0x7400 v_lshl_or_b32 v1, v108, 5, v1 s_add_i32 s2, s24, s2 s_mul_i32 s24, s37, s6 v_lshlrev_b32_e32 v5, 2, v5 v_add_u32_e32 v123, s22, v3 v_cmp_eq_u32_e32 vcc, 0, v1 v_mov_b32_e32 v1, 0 s_add_i32 s24, s2, s24 s_mul_hi_u32 s2, s8, s7 s_mul_i32 s3, s8, s3 v_add_u32_e32 v117, v116, v5 v_and_b32_e32 v118, 0x3fc, v2 v_add_u32_e32 v124, v123, v5 v_and_b32_e32 v2, 0xf0, v0 s_add_i32 s2, s2, s3 s_mul_i32 s3, s9, s7 v_mov_b32_e32 v3, v1 v_or_b32_e32 v5, 15, v0 v_mov_b32_e32 v6, v1 s_sub_i32 s0, 16, s35 v_add_u32_e32 v122, s22, v4 s_add_i32 s9, s2, s3 v_mad_u64_u32 v[3:4], s[2:3], s36, v109, v[2:3] v_mul_lo_u32 v10, s37, v109 v_mad_u64_u32 v[5:6], s[2:3], s36, v109, v[5:6] s_load_dwordx4 s[40:43], s[4:5], 0x120 s_max_i32 s0, s0, 0 s_lshl_b32 s0, s0, 8 s_or_b32 s27, s0, 0x10000 s_sub_i32 s0, s35, s6 v_cmp_gt_i32_e64 s[0:1], s0, v109 v_add_u32_e32 v4, v10, v4 v_add_u32_e32 v6, v10, v6 v_lshlrev_b32_e32 v10, 2, v109 s_load_dwordx2 s[56:57], s[4:5], 0x80 s_and_b64 s[22:23], vcc, s[0:1] s_waitcnt lgkmcnt(0) v_mov_b32_e32 v11, s43 v_add_co_u32_e32 v125, vcc, s42, v10 s_load_dwordx2 s[42:43], s[4:5], 0x68 v_lshlrev_b32_e32 v7, 9, v109 s_mul_i32 s25, 
s36, s6 s_mul_i32 s8, s8, s7 v_addc_co_u32_e32 v126, vcc, 0, v11, vcc v_and_b32_e32 v110, 48, v0 v_or_b32_e32 v8, v7, v2 v_or_b32_e32 v9, 0x100, v2 s_add_u32 s68, s8, s25 v_mov_b32_e32 v11, s19 v_add_co_u32_e32 v127, vcc, s18, v10 v_lshlrev_b64 v[102:103], 1, v[3:4] v_lshlrev_b64 v[104:105], 1, v[5:6] v_mbcnt_lo_u32_b32 v132, -1, 0 s_mov_b32 s39, 0x10000 v_or_b32_e32 v101, 0x200, v110 s_mov_b32 s38, s52 v_or_b32_e32 v119, 1, v114 v_or_b32_e32 v120, 2, v114 v_or_b32_e32 v121, 3, v114 s_addc_u32 s69, s9, s24 v_addc_co_u32_e32 v128, vcc, 0, v11, vcc s_brev_b32 s46, 1 s_mov_b32 s47, 0x20000 s_add_i32 s70, 0, 0x4400 s_add_i32 s71, 0, 0x4800 s_add_i32 s72, 0, 0x4c00 s_mov_b32 s73, 0xff800000 s_add_i32 s74, 0, 0x400 v_lshlrev_b32_e32 v129, 2, v8 v_add_lshl_u32 v130, v7, v2, 2 v_add_lshl_u32 v131, v7, v9, 2 v_mbcnt_hi_u32_b32 v133, -1, v132 s_add_i32 s75, 0, 0x800 s_add_i32 s76, 0, 0xc00 v_cmp_eq_u32_e64 s[2:3], 0, v110 v_cmp_gt_u32_e64 s[4:5], 16, v0 s_mov_b32 s18, s12 s_mov_b32 s48, 0 s_branch .LBB12_5 .LBB12_3: ; %Flow234 ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[8:9] .LBB12_4: ; %_ZN5flash45compute_attn_1rowblock_splitkv_mla_fp8_gfx938I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEvRK20Flash_fwd_mla_paramsiiiiiiibRT1_fff.exit ; in Loop: Header=BB12_5 Depth=1 s_add_i32 s8, s18, 1 s_cmp_lt_i32 s18, s14 s_mov_b32 s18, s8 s_cbranch_scc0 .LBB12_45 .LBB12_5: ; =>This Loop Header: Depth=1 ; Child Loop BB12_22 Depth 2 s_ashr_i32 s19, s18, 31 s_lshl_b64 s[60:61], s[18:19], 2 s_add_u32 s8, s28, s60 s_addc_u32 s9, s29, s61 global_load_dword v0, v1, s[8:9] s_cmp_le_i32 s18, s12 s_waitcnt vmcnt(0) v_readfirstlane_b32 s77, v0 s_cbranch_scc1 .LBB12_7 ; %bb.6: ; in Loop: Header=BB12_5 Depth=1 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) .LBB12_7: ; in Loop: Header=BB12_5 Depth=1 s_cmp_eq_u32 s18, s12 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[24:25], s[8:9], exec 
s_cselect_b32 s79, s13, 0 s_cmp_eq_u32 s18, s14 s_cselect_b32 s24, s15, s77 s_add_i32 s24, s24, 63 s_ashr_i32 s25, s24, 31 s_lshr_b32 s25, s25, 26 s_add_i32 s24, s24, s25 s_ashr_i32 s78, s24, 6 s_waitcnt lgkmcnt(0) s_mul_i32 s24, s18, s43 s_mul_hi_u32 s25, s18, s42 s_add_i32 s24, s25, s24 s_mul_i32 s25, s19, s42 s_add_i32 s24, s24, s25 s_mul_i32 s25, s18, s42 s_add_u32 s25, s62, s25 s_addc_u32 s24, s63, s24 v_readfirstlane_b32 s49, v108 s_add_u32 s44, s30, s25 s_addc_u32 s45, s31, s24 s_lshl_b32 s36, s49, 6 s_ashr_i32 s25, s36, 31 s_add_u32 s24, s44, s36 s_addc_u32 s25, s45, s25 s_lshl_b32 s37, s49, 10 s_add_i32 s80, s37, 0 s_cmp_lg_u32 s80, -1 s_cselect_b32 s81, s80, 0 s_bitset1_b32 s81, 31 s_nop 0 matrix_load_64x16_b8 s[24:27] s81 t r lds s_add_i32 s82, s81, 0x1000 s_nop 0 matrix_load_64x16_b8 s[24:27] s82 moffset:256 t r lds v_or_b32_e32 v0, s36, v110 v_cmp_gt_i32_e32 vcc, 64, v0 v_add_u32_e32 v2, v111, v0 s_and_b64 vcc, vcc, s[0:1] v_cndmask_b32_e32 v0, -1, v2, vcc s_add_i32 s24, s80, 0x2000 ;;#ASMSTART s_mov_b32 m0, s24 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND s_mov_b32 m0, 0 s_nop 0 ds_read_matrix_trans_format v[5:8], m0 element:1 row:3 col:1 ds_read_matrix_trans_format v[9:12], m0 offset:1024 element:1 row:3 col:1 ds_read_matrix_trans_format v[13:16], m0 offset:2048 element:1 row:3 col:1 ds_read_matrix_trans_format v[17:20], m0 offset:3072 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[21:24], m0 offset:4096 element:1 row:3 col:1 ds_read_matrix_trans_format v[25:28], m0 offset:5120 element:1 row:3 col:1 ds_read_matrix_trans_format v[29:32], m0 offset:6144 element:1 row:3 col:1 ds_read_matrix_trans_format v[33:36], m0 offset:7168 element:1 row:3 col:1 ;;#ASMSTART s_waitcnt vmcnt(0) s_barrier ;;#ASMEND s_mul_i32 s24, s18, s59 s_mul_hi_u32 s25, s18, s58 v_add_u32_e32 v0, 0, v112 s_add_i32 s24, s25, s24 s_mul_i32 s25, s19, s58 
ds_read_b128 v[37:40], v0 offset:8192 s_add_i32 s25, s24, s25 s_mul_i32 s24, s18, s58 s_lshl_b32 s86, s49, 4 s_lshl_b64 s[24:25], s[24:25], 2 s_add_u32 s37, s10, s24 s_addc_u32 s36, s11, s25 s_add_i32 s24, s78, -1 v_lshl_or_b32 v75, s49, 12, v113 s_cmp_le_i32 s78, s79 v_mov_b32_e32 v0, 0 s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_cbranch_scc1 .LBB12_20 ; %bb.8: ; %.lr.ph.i ; in Loop: Header=BB12_5 Depth=1 s_ashr_i32 s25, s24, 31 s_lshl_b64 s[44:45], s[24:25], 2 s_add_u32 s44, s37, s44 s_addc_u32 s45, s36, s45 ;;#ASMSTART s_load_dword s25, s[44:45], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s44, s25, 31 s_mul_i32 s45, s25, s57 s_mul_hi_u32 s49, s25, s56 s_add_i32 s45, s49, s45 s_mul_i32 s44, s44, s56 s_add_i32 s45, s45, s44 s_mul_i32 s25, s25, s56 s_add_u32 s44, s66, s25 s_addc_u32 s45, s67, s45 s_lshl_b32 s49, s24, 6 s_add_i32 s50, s86, s49 s_cmp_ge_i32 s50, s77 s_mov_b64 s[24:25], -1 s_cbranch_scc0 .LBB12_10 ; %bb.9: ; in Loop: Header=BB12_5 Depth=1 v_readfirstlane_b32 s24, v108 s_lshl_b32 s24, s24, 10 s_add_i32 s24, s24, 0 v_mov_b32_e32 v0, -1 s_nop 1 ;;#ASMSTART s_mov_b32 m0, s24 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s25, s24, 0x1000 ;;#ASMSTART s_mov_b32 m0, s25 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s25, s24, 0x2000 ;;#ASMSTART s_mov_b32 m0, s25 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s25, s24, 0x3000 ;;#ASMSTART s_mov_b32 m0, s25 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s25, s24, 0x4000 ;;#ASMSTART s_mov_b32 m0, s25 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s25, s24, 0x5000 ;;#ASMSTART s_mov_b32 m0, s25 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_add_i32 s25, s24, 0x6000 ;;#ASMSTART s_mov_b32 m0, s25 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_addk_i32 s24, 0x7000 ;;#ASMSTART s_mov_b32 m0, 
s24 buffer_load_dwordx4 v0, s[44:47], s48 ,offen offset:0, lds ;;#ASMEND s_mov_b64 s[24:25], 0 .LBB12_10: ; %Flow240 ; in Loop: Header=BB12_5 Depth=1 s_andn2_b64 vcc, exec, s[24:25] s_cbranch_vccnz .LBB12_12 ; %bb.11: ; in Loop: Header=BB12_5 Depth=1 s_ashr_i32 s24, s86, 31 s_mul_i32 s25, s86, s53 s_mul_hi_u32 s51, s86, s52 s_sub_i32 s50, s50, s77 s_add_i32 s25, s51, s25 s_mul_i32 s24, s24, s52 s_add_i32 s50, s50, 16 s_add_i32 s25, s25, s24 s_mul_i32 s24, s86, s52 s_max_i32 s50, s50, 0 s_add_u32 s88, s44, s24 s_addc_u32 s89, s45, s25 s_lshl_b32 s24, s50, 8 s_or_b32 s91, s24, 0x10000 s_mov_b32 s90, s38 s_nop 0 matrix_load_64x16_b8 s[88:91] s81 t r lds s_add_i32 s84, s81, 0x2000 matrix_load_64x16_b8 s[88:91] s82 moffset:64 t r lds s_add_i32 s83, s81, 0x3000 matrix_load_64x16_b8 s[88:91] s84 moffset:128 t r lds s_add_i32 s51, s81, 0x4000 matrix_load_64x16_b8 s[88:91] s83 moffset:192 t r lds s_add_i32 s50, s81, 0x5000 matrix_load_64x16_b8 s[88:91] s51 moffset:256 t r lds s_add_i32 s25, s81, 0x6000 matrix_load_64x16_b8 s[88:91] s50 moffset:320 t r lds s_add_i32 s24, s81, 0x7000 matrix_load_64x16_b8 s[88:91] s25 moffset:384 t r lds matrix_load_64x16_b8 s[88:91] s24 moffset:448 t r lds .LBB12_12: ; in Loop: Header=BB12_5 Depth=1 v_readfirstlane_b32 s24, v108 v_lshl_or_b32 v0, s24, 4, v109 v_mad_u64_u32 v[2:3], s[24:25], v0, s52, v[101:102] s_sub_i32 s24, s77, s49 v_cmp_gt_i32_e32 vcc, s24, v0 v_cndmask_b32_e32 v0, -1, v2, vcc buffer_load_dwordx4 v[49:52], v0, s[44:47], 0 offen s_add_i32 s25, 0, 0x4000 v_add_u32_e32 v0, s25, v75 v_add_u32_e32 v46, s70, v75 v_add_u32_e32 v61, s71, v75 v_add_u32_e32 v62, s72, v75 ;;#ASMSTART s_waitcnt vmcnt(8) s_barrier ;;#ASMEND s_mov_b32 m0, s80 s_nop 0 ds_read_matrix_trans_format v[41:44], m0 element:1 row:3 col:1 v_mov_b32_e32 v3, v1 v_mov_b32_e32 v4, v1 v_mov_b32_e32 v2, v1 v_mov_b64_e32 v[55:56], v[3:4] v_mov_b64_e32 v[53:54], v[1:2] s_waitcnt lgkmcnt(0) s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[5:6], v[41:42], v[53:56] lit 
v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[7:8], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(7) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:4096 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[9:10], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[11:12], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(6) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:8192 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[13:14], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[15:16], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(5) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:12288 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[17:18], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[19:20], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:16384 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[21:22], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[23:24], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:20480 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[25:26], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[27:28], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[41:44], m0 offset:24576 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[29:30], v[41:42], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[31:32], v[43:44], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[57:60], m0 offset:28672 element:1 row:3 col:1 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 
v[41:43:45:47], v0 ;s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[42:44:46:48], v46 ds_read_m64x16_b8_alt4 v[67:69:71:73], v61 ds_read_m64x16_b8_alt4 v[72:74:76:78], v62 ; sched_barrier mask(0x00000000) s_waitcnt lgkmcnt(4) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[33:34], v[57:58], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[35:36], v[59:60], v[53:56] lit ;;#ASMSTART s_waitcnt vmcnt(0) ;;#ASMEND s_waitcnt vmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[37:38], v[49:50], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[39:40], v[51:52], v[53:56] lit ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND v_mov_b32_e32 v4, 0xff800000 v_cmp_gt_i32_e32 vcc, s24, v114 s_nop 0 v_cndmask_b32_e32 v0, v4, v53, vcc v_cmp_gt_i32_e32 vcc, s24, v119 v_cndmask_b32_e32 v2, v4, v54, vcc v_cmp_gt_i32_e32 vcc, s24, v120 v_and_b32_e32 v50, 63, v133 v_and_b32_e32 v52, 64, v133 v_cndmask_b32_e32 v3, v4, v55, vcc v_cmp_gt_i32_e32 vcc, s24, v121 v_xor_b32_e32 v51, 32, v50 v_add_u32_e32 v52, 64, v52 v_cndmask_b32_e32 v4, v4, v56, vcc v_cmp_lt_i32_e32 vcc, v51, v52 v_max_f32_e32 v49, v0, v2 v_cndmask_b32_e32 v51, v133, v51, vcc v_max3_f32 v49, v49, v3, v4 v_lshlrev_b32_e32 v51, 2, v51 ds_bpermute_b32 v51, v51, v49 v_xor_b32_e32 v50, 16, v50 v_cmp_lt_i32_e32 vcc, v50, v52 v_cndmask_b32_e32 v50, v133, v50, vcc v_lshlrev_b32_e32 v50, 2, v50 s_waitcnt lgkmcnt(0) v_max_f32_e32 v49, v49, v51 ds_bpermute_b32 v50, v50, v49 s_and_saveexec_b64 s[24:25], s[2:3] s_cbranch_execz .LBB12_14 ; %bb.13: ; in Loop: Header=BB12_5 Depth=1 s_waitcnt lgkmcnt(0) v_max_f32_e32 v49, v49, v50 ds_write_b32 v115, v49 .LBB12_14: ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[24:25] v_mov_b32_e32 v57, v47 v_mov_b32_e32 v66, v48 v_mov_b32_e32 v67, v67 v_mov_b32_e32 v59, v72 v_mov_b32_e32 v56, v45 v_mov_b32_e32 v55, v43 v_mov_b32_e32 v54, v41 v_mov_b32_e32 v65, v46 v_mov_b32_e32 v64, v44 v_mov_b32_e32 v63, v42 v_mov_b32_e32 v68, v69 v_mov_b32_e32 v69, v71 v_mov_b32_e32 v70, v73 v_mov_b32_e32 v60, v74 
v_mov_b32_e32 v61, v76 v_mov_b32_e32 v62, v78 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[24:25], s[4:5] s_cbranch_execz .LBB12_16 ; %bb.15: ; in Loop: Header=BB12_5 Depth=1 ds_read_b128 v[41:44], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v43, v43, v44 v_max3_f32 v41, v41, v42, v43 ds_write_b32 v117, v41 offset:256 .LBB12_16: ; %_ZN5flash7SoftmaxILi1EE21softmax_rescale_o_fp8ILb1ELb0EN4cute6TensorINS3_13array_alignedIfLm4ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEESB_EEENS8_IJSB_NS9_ILi0EEESD_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.i ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v143, v117 offset:256 v_add_u32_e32 v42, s76, v75 s_add_i32 s24, s78, -2 s_waitcnt lgkmcnt(0) v_mul_f32_e32 v41, v143, v107 v_cmp_lg_f32_e32 vcc, s73, v143 v_cndmask_b32_e32 v41, 0, v41, vcc v_fma_f32 v0, v0, v107, -v41 v_fma_f32 v2, v2, v107, -v41 v_exp_f32_e32 v0, v0 v_exp_f32_e32 v73, v2 v_fma_f32 v2, v3, v107, -v41 s_nop 0 v_exp_f32_e32 v74, v2 v_fma_f32 v2, v4, v107, -v41 v_add_u32_e32 v41, 0, v118 v_exp_f32_e32 v4, v2 v_mov_b32_e32 v2, v0 v_cvt_pk_fp8_f32 v2, v2, v73, s0 v_mov_b32_e32 v3, v74 v_cvt_pk_fp8_f32 v3, v3, v4, v2 op_sel:[0,0,0,1] v_add_u32_e32 v2, 0, v75 ds_write_b32 v41, v3 offset:28672 v_add_u32_e32 v3, 0, v112 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[76:79], v3 offset:28672 v_add_u32_e32 v3, s74, v75 v_add_u32_e32 v41, s75, v75 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[80:82:84:86], v2 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[81:83:85:87], v3 ds_read_m64x16_b8_alt4 v[88:90:92:94], v41 # s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[89:91:93:95], v42 s_waitcnt lgkmcnt(4) ; sched_barrier mask(0x00000000) s_mov_b32 s49, s48 s_mov_b32 s50, s48 s_mov_b32 s51, s48 v_mov_b64_e32 v[45:46], s[48:49] v_mov_b64_e32 v[47:48], s[50:51] 
v_mov_b64_e32 v[51:52], v[47:48] v_mov_b32_e32 v3, v63 v_mov_b64_e32 v[41:42], v[45:46] v_mov_b32_e32 v63, v55 v_mov_b64_e32 v[49:50], v[45:46] v_mov_b32_e32 v2, v54 v_mov_b64_e32 v[43:44], v[47:48] v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[76:77], v[63:64], v[49:52] lit v_mov_b32_e32 v64, v56 v_mov_b64_e32 v[55:56], v[47:48] v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[76:77], v[2:3], v[41:44] lit v_mov_b32_e32 v3, v59 v_mov_b32_e32 v59, v68 v_mov_b64_e32 v[53:54], v[45:46] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[78:79], v[59:60], v[49:52] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[76:77], v[64:65], v[53:56] lit v_mov_b32_e32 v60, v69 s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[78:79], v[60:61], v[53:56] lit v_mov_b32_e32 v65, v57 v_mov_b64_e32 v[59:60], v[47:48] v_mov_b64_e32 v[57:58], v[45:46] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[76:77], v[65:66], v[57:60] lit v_mov_b32_e32 v61, v70 v_mov_b32_e32 v2, v67 s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[78:79], v[61:62], v[57:60] lit v_mov_b64_e32 v[63:64], v[47:48] v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[78:79], v[2:3], v[41:44] lit s_waitcnt lgkmcnt(0) v_mov_b32_e32 v2, v80 v_mov_b32_e32 v3, v81 v_mov_b64_e32 v[61:62], v[45:46] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[76:77], v[2:3], v[61:64] lit v_mov_b32_e32 v2, v88 s_waitcnt lgkmcnt(0) v_mov_b32_e32 v3, v89 v_mov_b64_e32 v[67:68], v[47:48] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[78:79], v[2:3], v[61:64] lit v_mov_b32_e32 v2, v82 v_mov_b32_e32 v3, v83 v_mov_b64_e32 v[65:66], v[45:46] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[76:77], v[2:3], v[65:68] lit v_mov_b32_e32 v2, v90 v_mov_b32_e32 v3, v91 v_mov_b64_e32 v[71:72], v[47:48] s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[78:79], v[2:3], v[65:68] lit v_mov_b32_e32 v2, v84 v_mov_b32_e32 v3, v85 v_mov_b64_e32 v[69:70], v[45:46] s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[76:77], v[2:3], v[69:72] lit v_mov_b32_e32 v2, v92 v_mov_b32_e32 v3, v93 s_nop 1 
v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[78:79], v[2:3], v[69:72] lit v_mov_b32_e32 v2, v86 v_mov_b32_e32 v3, v87 v_add_f32_e32 v0, v73, v0 s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[76:77], v[2:3], v[45:48] lit v_mov_b32_e32 v2, v94 v_mov_b32_e32 v3, v95 v_add_f32_e32 v0, v0, v74 s_nop 0 v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[78:79], v[2:3], v[45:48] lit v_add_f32_e32 v0, v0, v4 s_cmp_lt_i32 s24, s79 s_cbranch_scc0 .LBB12_21 .LBB12_17: ; in Loop: Header=BB12_5 Depth=1 v_mov_b32_e32 v2, v143 .LBB12_18: ; %._crit_edge.i ; in Loop: Header=BB12_5 Depth=1 s_cmp_eq_u32 s79, 0 s_cselect_b64 s[24:25], -1, 0 s_add_i32 s36, s77, 63 s_ashr_i32 s37, s36, 31 s_lshr_b32 s37, s37, 26 s_add_i32 s36, s36, s37 s_ashr_i32 s36, s36, 6 s_cmp_eq_u32 s78, s36 s_cselect_b64 s[36:37], -1, 0 s_and_b64 s[24:25], s[24:25], s[36:37] s_andn2_b64 vcc, exec, s[24:25] s_mov_b64 s[24:25], -1 s_cbranch_vccnz .LBB12_28 ; %bb.19: ; %Flow235 ; in Loop: Header=BB12_5 Depth=1 s_and_b64 vcc, exec, s[24:25] s_cbranch_vccz .LBB12_4 s_branch .LBB12_37 .LBB12_20: ; in Loop: Header=BB12_5 Depth=1 s_mov_b32 s50, s48 s_mov_b32 s51, s48 s_mov_b32 s49, s48 v_mov_b64_e32 v[59:60], s[50:51] v_mov_b64_e32 v[57:58], s[48:49] v_mov_b64_e32 v[53:54], v[57:58] v_mov_b64_e32 v[49:50], v[57:58] v_mov_b64_e32 v[41:42], v[57:58] v_mov_b64_e32 v[45:46], v[57:58] v_mov_b64_e32 v[71:72], v[59:60] v_mov_b64_e32 v[67:68], v[59:60] v_mov_b64_e32 v[63:64], v[59:60] v_mov_b32_e32 v143, 0 v_mov_b64_e32 v[55:56], v[59:60] v_mov_b64_e32 v[51:52], v[59:60] v_mov_b64_e32 v[43:44], v[59:60] v_mov_b64_e32 v[47:48], v[59:60] v_mov_b64_e32 v[69:70], v[57:58] v_mov_b64_e32 v[65:66], v[57:58] v_mov_b64_e32 v[61:62], v[57:58] s_cmp_lt_i32 s24, s79 s_cbranch_scc1 .LBB12_17 .LBB12_21: ; %.lr.ph624.i ; in Loop: Header=BB12_5 Depth=1 s_add_i32 s44, 0, 0x4000 s_ashr_i32 s25, s86, 31 v_add_u32_e32 v134, s44, v75 s_mul_i32 s44, s86, s53 s_mul_hi_u32 s45, s86, s52 s_add_i32 s44, s45, s44 s_mul_i32 s25, s25, s52 s_add_i32 s87, s44, s25 s_lshl_b32 
s25, s24, 6 s_sub_i32 s89, s77, s25 s_ashr_i32 s25, s24, 31 s_add_i32 s49, s81, 0x2000 s_add_i32 s50, s81, 0x3000 s_add_i32 s51, s81, 0x4000 s_add_i32 s83, s81, 0x5000 s_add_i32 s84, s81, 0x6000 s_add_i32 s85, s81, 0x7000 s_add_i32 s88, s24, 1 s_lshl_b64 s[24:25], s[24:25], 2 s_add_u32 s24, s37, s24 v_add_u32_e32 v135, s70, v75 v_add_u32_e32 v136, s71, v75 v_add_u32_e32 v137, s72, v75 v_add_u32_e32 v138, 0, v75 v_add_u32_e32 v139, s74, v75 v_add_u32_e32 v140, s75, v75 v_add_u32_e32 v141, s76, v75 s_mul_i32 s86, s86, s52 s_addc_u32 s25, s36, s25 ; implicit-def: $vgpr142 .LBB12_22: ; Parent Loop BB12_5 Depth=1 ; => This Inner Loop Header: Depth=2 s_mov_b64 s[36:37], s[24:25] ;;#ASMSTART s_load_dword s44, s[36:37], 0x0 s_waitcnt lgkmcnt(0) ;;#ASMEND s_ashr_i32 s36, s44, 31 s_mul_i32 s37, s44, s57 s_mul_hi_u32 s45, s44, s56 s_add_i32 s37, s45, s37 s_mul_i32 s36, s36, s56 s_add_i32 s37, s37, s36 s_mul_i32 s44, s44, s56 s_add_u32 s44, s66, s44 s_addc_u32 s45, s67, s37 s_add_u32 s36, s44, s86 s_addc_u32 s37, s45, s87 s_nop 0 matrix_load_64x16_b8 s[36:39] s81 t r lds matrix_load_64x16_b8 s[36:39] s82 moffset:64 t r lds v_readfirstlane_b32 s90, v108 matrix_load_64x16_b8 s[36:39] s49 moffset:128 t r lds v_lshl_or_b32 v4, s90, 4, v109 matrix_load_64x16_b8 s[36:39] s50 moffset:192 t r lds v_mad_u64_u32 v[2:3], s[90:91], v4, s52, v[101:102] matrix_load_64x16_b8 s[36:39] s51 moffset:256 t r lds matrix_load_64x16_b8 s[36:39] s83 moffset:320 t r lds matrix_load_64x16_b8 s[36:39] s84 moffset:384 t r lds v_cmp_gt_i32_e32 vcc, s89, v4 matrix_load_64x16_b8 s[36:39] s85 moffset:448 t r lds v_cndmask_b32_e32 v2, -1, v2, vcc buffer_load_dwordx4 v[85:88], v2, s[44:47], 0 offen ;;#ASMSTART s_waitcnt vmcnt(8) s_barrier ;;#ASMEND s_mov_b32 m0, s80 s_nop 0 ds_read_matrix_trans_format v[77:80], m0 element:1 row:3 col:1 v_mov_b32_e32 v3, v1 v_mov_b32_e32 v4, v1 v_mov_b32_e32 v2, v1 v_mov_b64_e32 v[75:76], v[3:4] v_mov_b64_e32 v[73:74], v[1:2] s_waitcnt lgkmcnt(0) s_nop 0 
v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[5:6], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[7:8], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(7) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:4096 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[9:10], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[11:12], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(6) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:8192 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[13:14], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[15:16], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(5) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:12288 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[17:18], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[19:20], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(4) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:16384 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[21:22], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[23:24], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(3) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:20480 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[25:26], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[27:28], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(2) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[77:80], m0 offset:24576 element:1 row:3 col:1 s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[29:30], v[77:78], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[31:32], v[79:80], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(1) s_barrier ;;#ASMEND ds_read_matrix_trans_format v[89:92], m0 offset:28672 element:1 row:3 
col:1 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[77:79:81:83], v134 ;s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[78:80:82:84], v135 ds_read_m64x16_b8_alt4 v[152:154:156:158], v136 ;s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[153:155:157:159], v137 s_waitcnt lgkmcnt(4) ; sched_barrier mask(0x00000000) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[33:34], v[89:90], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[35:36], v[91:92], v[73:76] lit ;;#ASMSTART s_waitcnt vmcnt(0) ;;#ASMEND s_waitcnt vmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[37:38], v[85:86], v[73:76] lit v_mmac_f32_16x16x32_fp8_fp8 v[73:76], v[39:40], v[87:88], v[73:76] lit ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND v_mbcnt_hi_u32_b32 v3, -1, v132 v_and_b32_e32 v4, 63, v3 v_and_b32_e32 v86, 64, v3 v_xor_b32_e32 v85, 32, v4 v_add_u32_e32 v86, 64, v86 v_cmp_lt_i32_e32 vcc, v85, v86 v_max3_f32 v2, v143, v73, v74 v_cndmask_b32_e32 v85, v3, v85, vcc v_max3_f32 v2, v2, v75, v76 v_lshlrev_b32_e32 v85, 2, v85 ds_bpermute_b32 v85, v85, v2 v_xor_b32_e32 v4, 16, v4 v_cmp_lt_i32_e32 vcc, v4, v86 v_cndmask_b32_e32 v3, v3, v4, vcc v_lshlrev_b32_e32 v3, 2, v3 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v85 ds_bpermute_b32 v3, v3, v2 s_and_saveexec_b64 s[36:37], s[2:3] s_cbranch_execz .LBB12_24 ; %bb.23: ; in Loop: Header=BB12_22 Depth=2 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v2, v3 ds_write_b32 v115, v2 .LBB12_24: ; in Loop: Header=BB12_22 Depth=2 s_or_b64 exec, exec, s[36:37] # v_mov_b32_e32 v92, v83 # v_mov_b32_e32 v91, v81 # v_mov_b32_e32 v90, v79 # v_mov_b32_e32 v89, v77 # v_mov_b32_e32 v84, v84 # v_mov_b32_e32 v83, v82 # v_mov_b32_e32 v82, v80 # v_mov_b32_e32 v81, v78 # v_mov_b32_e32 v85, v93 # v_mov_b32_e32 v77, v94 # v_mov_b32_e32 v86, v95 # v_mov_b32_e32 v87, v97 # v_mov_b32_e32 v88, v99 # v_mov_b32_e32 v78, v96 # v_mov_b32_e32 v79, v98 # v_mov_b32_e32 v80, v100 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[36:37], s[4:5] s_cbranch_execz .LBB12_26 ; 
%bb.25: ; in Loop: Header=BB12_22 Depth=2 ds_read_b128 v[93:96], v116 s_waitcnt lgkmcnt(0) v_max_f32_e32 v2, v95, v96 v_max3_f32 v2, v93, v94, v2 ds_write_b32 v117, v2 offset:256 .LBB12_26: ; %_ZN5flash7SoftmaxILi1EE21softmax_rescale_o_fp8ILb0ELb0EN4cute6TensorINS3_13array_alignedIfLm4ELm16EEENS3_6LayoutINS3_5tupleIJNS3_1CILi4EEENS9_ILi1EEESB_EEENS8_IJSB_NS9_ILi0EEESD_EEEEEEENS4_INS3_10ViewEngineINS3_8smem_ptrIfEEEENS7_INS8_IJNS9_ILi128EEEEEENS8_IJSB_EEEEEEEEEvRT1_RT2_fPDv4_f.exit.i ; in Loop: Header=BB12_22 Depth=2 s_or_b64 exec, exec, s[36:37] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v2, v117 offset:256 s_waitcnt lgkmcnt(0) v_mul_f32_e32 v93, v2, v107 v_cmp_lg_f32_e32 vcc, s73, v2 v_sub_f32_e32 v3, v143, v2 v_cndmask_b32_e32 v93, 0, v93, vcc v_mul_f32_e32 v3, v3, v107 v_fma_f32 v73, v73, v107, -v93 v_fma_f32 v74, v74, v107, -v93 v_exp_f32_e32 v3, v3 v_exp_f32_e32 v73, v73 v_exp_f32_e32 v94, v74 v_fma_f32 v74, v75, v107, -v93 v_mov_b32_e32 v4, v3 v_exp_f32_e32 v95, v74 v_fma_f32 v74, v76, v107, -v93 v_fma_f32 v0, v3, v0, v73 v_exp_f32_e32 v151, v74 v_cvt_pk_fp8_f32 v73, v73, v94, v142 v_mov_b32_e32 v142, v95 v_add_f32_e32 v0, v0, v94 v_pk_mul_f32 v[61:62], v[3:4], v[61:62] v_cvt_pk_fp8_f32 v142, v142, v151, v73 op_sel:[0,0,0,1] v_add_u32_e32 v73, 0, v118 v_pk_mul_f32 v[63:64], v[3:4], v[63:64] v_pk_mul_f32 v[65:66], v[3:4], v[65:66] ds_write_b32 v73, v142 offset:28672 v_add_u32_e32 v73, 0, v112 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b128 v[73:76], v73 offset:28672 v_pk_mul_f32 v[67:68], v[3:4], v[67:68] v_pk_mul_f32 v[69:70], v[3:4], v[69:70] v_pk_mul_f32 v[71:72], v[3:4], v[71:72] v_pk_mul_f32 v[45:46], v[3:4], v[45:46] v_pk_mul_f32 v[47:48], v[3:4], v[47:48] v_pk_mul_f32 v[41:42], v[3:4], v[41:42] v_pk_mul_f32 v[43:44], v[3:4], v[43:44] v_pk_mul_f32 v[49:50], v[3:4], v[49:50] v_pk_mul_f32 v[51:52], v[3:4], v[51:52] v_pk_mul_f32 v[53:54], v[3:4], v[53:54] v_pk_mul_f32 v[55:56], v[3:4], v[55:56] v_pk_mul_f32 v[57:58], 
v[3:4], v[57:58] v_pk_mul_f32 v[59:60], v[3:4], v[59:60] v_add_f32_e32 v0, v0, v95 ; sched_barrier mask(0x00000000) ds_read_m64x16_b8_alt4 v[93:95:97:99], v138 ;s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[94:96:98:100], v139 ds_read_m64x16_b8_alt4 v[143:145:147:149], v140 ;s_waitcnt lgkmcnt(0) ds_read_m64x16_b8_alt4 v[144:146:148:150], v141 ; sched_barrier mask(0x00000000) s_waitcnt lgkmcnt(4) ;v_mov_b32_e32 v3, v89 ;v_mov_b32_e32 v4, v81 ;s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[73:74], v[77:78], v[41:44] lit # v_mov_b32_e32 v3, v85 # v_mov_b32_e32 v4, v77 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[41:44], v[75:76], v[152:153], v[41:44] lit v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[73:74], v[79:80], v[49:52] lit v_mmac_f32_16x16x32_fp8_fp8 v[49:52], v[75:76], v[154:155], v[49:52] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[73:74], v[81:82], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[53:56], v[75:76], v[156:157], v[53:56] lit v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[73:74], v[83:84], v[57:60] lit v_mmac_f32_16x16x32_fp8_fp8 v[57:60], v[75:76], v[158:159], v[57:60] lit s_waitcnt lgkmcnt(2) # v_mov_b32_e32 v3, v93 # v_mov_b32_e32 v4, v94 # s_nop 1 v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[73:74], v[93:94], v[61:64] lit v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[73:74], v[95:96], v[65:68] lit v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[73:74], v[97:98], v[69:72] lit v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[73:74], v[99:100], v[45:48] lit s_waitcnt lgkmcnt(0) v_mmac_f32_16x16x32_fp8_fp8 v[61:64], v[75:76], v[143:144], v[61:64] lit v_mmac_f32_16x16x32_fp8_fp8 v[65:68], v[75:76], v[145:146], v[65:68] lit v_mmac_f32_16x16x32_fp8_fp8 v[69:72], v[75:76], v[147:148], v[69:72] lit v_mmac_f32_16x16x32_fp8_fp8 v[45:48], v[75:76], v[149:150], v[45:48] lit s_add_i32 s88, s88, -1 s_add_i32 s89, s89, 64 s_add_u32 s24, s24, -4 s_addc_u32 s25, s25, -1 s_cmp_gt_i32 s88, s79 v_add_f32_e32 v0, v0, v151 s_cbranch_scc0 .LBB12_18 ; %bb.27: ; in Loop: Header=BB12_22 Depth=2 v_mov_b32_e32 v143, v2 
s_branch .LBB12_22 .LBB12_28: ; in Loop: Header=BB12_5 Depth=1 s_add_u32 s24, s40, s60 s_addc_u32 s25, s41, s61 global_load_dword v5, v1, s[24:25] v_mbcnt_hi_u32_b32 v3, -1, v132 v_and_b32_e32 v4, 63, v3 v_and_b32_e32 v7, 64, v3 v_xor_b32_e32 v6, 32, v4 v_add_u32_e32 v7, 64, v7 v_cmp_lt_i32_e32 vcc, v6, v7 v_cndmask_b32_e32 v6, v3, v6, vcc v_lshlrev_b32_e32 v6, 2, v6 ds_bpermute_b32 v6, v6, v0 v_xor_b32_e32 v4, 16, v4 v_cmp_lt_i32_e32 vcc, v4, v7 v_cndmask_b32_e32 v3, v3, v4, vcc v_lshlrev_b32_e32 v4, 2, v3 s_waitcnt lgkmcnt(0) v_add_f32_e32 v3, v6, v0 ds_bpermute_b32 v4, v4, v3 s_waitcnt vmcnt(0) v_readfirstlane_b32 s36, v5 s_and_saveexec_b64 s[24:25], s[2:3] s_cbranch_execz .LBB12_30 ; %bb.29: ; in Loop: Header=BB12_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v3, v3, v4 ds_write_b32 v122, v3 .LBB12_30: ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[24:25], s[4:5] s_cbranch_execz .LBB12_32 ; %bb.31: ; in Loop: Header=BB12_5 Depth=1 ds_read_b128 v[3:6], v123 s_waitcnt lgkmcnt(0) v_pk_add_f32 v[3:4], v[5:6], v[3:4] v_add_f32_e32 v3, v4, v3 ds_write_b32 v124, v3 offset:256 .LBB12_32: ; %_ZN5flash7SoftmaxILi1EE29normalize_softmax_lse_fp8_tp1ILb0ELb1ELb0EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT2_fff.exit.i ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_and_b64 s[8:9], s[8:9], exec s_cselect_b32 s8, s34, 0 s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v3, v124 offset:256 s_add_i32 s8, s36, s8 s_mul_i32 s8, s8, s21 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s24, s8, s6 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v3 s_and_saveexec_b64 s[36:37], s[22:23] s_cbranch_execz .LBB12_34 ; %bb.33: ; in Loop: Header=BB12_5 Depth=1 v_log_f32_e32 v4, v3 s_ashr_i32 s25, s24, 31 v_mov_b32_e32 v5, 
0xff800000 s_lshl_b64 s[8:9], s[24:25], 2 v_mul_f32_e32 v4, 0x3f317218, v4 v_fmac_f32_e32 v4, v2, v106 v_cndmask_b32_e32 v6, v4, v5, vcc v_mov_b32_e32 v5, s9 v_add_co_u32_e64 v4, s[8:9], s8, v125 v_addc_co_u32_e64 v5, s[8:9], v126, v5, s[8:9] global_store_dword v[4:5], v6, off .LBB12_34: ; %.loopexit600.i ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[36:37] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB12_36 ; %bb.35: ; %.preheader597.i ; in Loop: Header=BB12_5 Depth=1 v_rcp_f32_e32 v6, v3 v_mov_b32_e32 v3, v61 v_mov_b32_e32 v4, v65 s_mul_i32 s24, s24, s20 v_mul_f32_e32 v6, s33, v6 v_cndmask_b32_e64 v29, v6, 1.0, vcc v_mov_b32_e32 v30, v29 v_pk_mul_f32 v[31:32], v[29:30], v[3:4] v_mov_b32_e32 v3, v63 v_mov_b32_e32 v4, v67 v_mov_b32_e32 v17, v42 v_mov_b32_e32 v18, v50 s_ashr_i32 s25, s24, 31 v_pk_mul_f32 v[9:10], v[29:30], v[3:4] v_mov_b32_e32 v3, v64 v_mov_b32_e32 v4, v68 v_pk_mul_f32 v[19:20], v[29:30], v[17:18] v_mov_b32_e32 v17, v43 v_mov_b32_e32 v18, v51 s_lshl_b64 s[24:25], s[24:25], 2 v_pk_mul_f32 v[13:14], v[29:30], v[3:4] v_mov_b32_e32 v3, v69 v_mov_b32_e32 v4, v45 v_mov_b32_e32 v15, v72 v_mov_b32_e32 v16, v48 v_pk_mul_f32 v[23:24], v[29:30], v[17:18] v_mov_b32_e32 v17, v44 v_mov_b32_e32 v18, v52 s_add_u32 s24, s54, s24 v_mov_b32_e32 v5, v62 v_mov_b32_e32 v6, v66 v_pk_mul_f32 v[3:4], v[29:30], v[3:4] v_mov_b32_e32 v7, v70 v_mov_b32_e32 v8, v46 v_mov_b32_e32 v11, v71 v_mov_b32_e32 v12, v47 v_pk_mul_f32 v[33:34], v[29:30], v[15:16] v_mov_b32_e32 v15, v41 v_mov_b32_e32 v16, v49 v_pk_mul_f32 v[27:28], v[29:30], v[17:18] v_mov_b32_e32 v17, v53 v_mov_b32_e32 v18, v57 v_mov_b32_e32 v21, v54 v_mov_b32_e32 v22, v58 v_mov_b32_e32 v25, v55 v_mov_b32_e32 v26, v59 v_mov_b32_e32 v35, v56 v_mov_b32_e32 v36, v60 s_addc_u32 s25, s55, s25 v_pk_mul_f32 v[5:6], v[29:30], v[5:6] v_pk_mul_f32 v[7:8], v[29:30], v[7:8] v_pk_mul_f32 v[11:12], v[29:30], v[11:12] v_pk_mul_f32 v[15:16], v[29:30], v[15:16] v_pk_mul_f32 v[17:18], v[29:30], v[17:18] v_pk_mul_f32 
v[21:22], v[29:30], v[21:22] v_pk_mul_f32 v[25:26], v[29:30], v[25:26] v_pk_mul_f32 v[29:30], v[29:30], v[35:36] global_store_dwordx2 v129, v[31:32], s[24:25] global_store_dwordx4 v130, v[3:6], s[24:25] offset:8 global_store_dwordx4 v130, v[7:10], s[24:25] offset:24 global_store_dwordx4 v130, v[11:14], s[24:25] offset:40 global_store_dwordx2 v130, v[33:34], s[24:25] offset:56 global_store_dwordx4 v130, v[15:18], s[24:25] offset:1024 global_store_dwordx4 v131, v[19:22], s[24:25] offset:16 global_store_dwordx4 v131, v[23:26], s[24:25] offset:32 global_store_dwordx4 v131, v[27:30], s[24:25] offset:48 .LBB12_36: ; %Flow ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_mov_b64 s[24:25], 0 s_branch .LBB12_4 .LBB12_37: ; in Loop: Header=BB12_5 Depth=1 v_mbcnt_hi_u32_b32 v3, -1, v132 v_and_b32_e32 v4, 63, v3 v_and_b32_e32 v6, 64, v3 v_xor_b32_e32 v5, 32, v4 v_add_u32_e32 v6, 64, v6 v_cmp_lt_i32_e32 vcc, v5, v6 v_cndmask_b32_e32 v5, v3, v5, vcc v_lshlrev_b32_e32 v5, 2, v5 ds_bpermute_b32 v5, v5, v0 v_xor_b32_e32 v4, 16, v4 v_cmp_lt_i32_e32 vcc, v4, v6 v_cndmask_b32_e32 v3, v3, v4, vcc v_lshlrev_b32_e32 v3, 2, v3 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v5, v0 ds_bpermute_b32 v3, v3, v0 s_and_saveexec_b64 s[8:9], s[2:3] s_cbranch_execz .LBB12_39 ; %bb.38: ; in Loop: Header=BB12_5 Depth=1 s_waitcnt lgkmcnt(0) v_add_f32_e32 v0, v0, v3 ds_write_b32 v122, v0 .LBB12_39: ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt vmcnt(0) lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) s_and_saveexec_b64 s[8:9], s[4:5] s_cbranch_execz .LBB12_41 ; %bb.40: ; in Loop: Header=BB12_5 Depth=1 ds_read_b128 v[3:6], v123 s_waitcnt lgkmcnt(0) v_pk_add_f32 v[3:4], v[5:6], v[3:4] v_add_f32_e32 v0, v4, v3 ds_write_b32 v124, v0 offset:256 .LBB12_41: ; 
%_ZN5flash7SoftmaxILi1EE25normalize_softmax_lse_fp8ILb0ELb0EN4cute6TensorINS3_10ViewEngineINS3_8smem_ptrIfEEEENS3_6LayoutINS3_5tupleIJNS3_1CILi128EEEEEENSA_IJNSB_ILi1EEEEEEEEEEEENS4_INS3_13array_alignedIfLm1ELm16EEENS9_ISF_NSA_IJNSB_ILi0EEEEEEEEEEPDv4_fRT1_fff.exit.i ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[8:9] s_waitcnt lgkmcnt(0) s_barrier s_waitcnt lgkmcnt(0) ds_read_b32 v0, v124 offset:256 s_waitcnt lgkmcnt(0) v_cmp_eq_f32_e32 vcc, 0, v0 s_and_saveexec_b64 s[24:25], s[22:23] s_cbranch_execz .LBB12_43 ; %bb.42: ; in Loop: Header=BB12_5 Depth=1 s_mul_i32 s8, s18, s21 v_log_f32_e32 v3, v0 s_add_i32 s8, s8, s7 s_mul_i32 s8, s8, s35 s_add_i32 s8, s8, s6 s_ashr_i32 s9, s8, 31 v_mul_f32_e32 v3, 0x3f317218, v3 v_fmac_f32_e32 v3, v2, v106 v_mov_b32_e32 v2, 0x7f800000 s_lshl_b64 s[8:9], s[8:9], 2 v_cndmask_b32_e32 v4, v3, v2, vcc v_mov_b32_e32 v3, s9 v_add_co_u32_e64 v2, s[8:9], s8, v127 v_addc_co_u32_e64 v3, s[8:9], v128, v3, s[8:9] global_store_dword v[2:3], v4, off .LBB12_43: ; %.loopexit.i ; in Loop: Header=BB12_5 Depth=1 s_or_b64 exec, exec, s[24:25] s_and_saveexec_b64 s[8:9], s[0:1] s_cbranch_execz .LBB12_3 ; %bb.44: ; %.preheader.i ; in Loop: Header=BB12_5 Depth=1 v_rcp_f32_e32 v0, v0 s_mul_i32 s24, s18, s65 s_mul_hi_u32 s25, s18, s64 s_add_i32 s24, s25, s24 s_mul_i32 s19, s19, s64 s_add_i32 s19, s24, s19 s_mul_i32 s24, s18, s64 v_mul_f32_e32 v0, s33, v0 s_add_u32 s24, s68, s24 v_cndmask_b32_e64 v0, v0, 1.0, vcc s_addc_u32 s25, s69, s19 v_mul_f32_e32 v2, v0, v61 v_mul_f32_e32 v4, v0, v62 v_mul_f32_e32 v3, v0, v65 v_mul_f32_e32 v5, v0, v66 v_mul_f32_e32 v10, v0, v69 v_mul_f32_e32 v11, v0, v70 v_mul_f32_e32 v14, v0, v45 v_mul_f32_e32 v15, v0, v46 s_lshl_b64 s[24:25], s[24:25], 1 v_mul_f32_e32 v6, v0, v63 v_mul_f32_e32 v8, v0, v64 v_mul_f32_e32 v7, v0, v67 v_mul_f32_e32 v9, v0, v68 v_mul_f32_e32 v12, v0, v71 v_mul_f32_e32 v16, v0, v47 s_add_u32 s19, s16, s24 v_cvt_pk_bf16_f32 v2, v2, v3 v_cvt_pk_bf16_f32 v3, v10, v14 v_cvt_pk_bf16_f32 v4, v4, v5 
v_cvt_pk_bf16_f32 v5, v11, v15 v_mul_f32_e32 v13, v0, v72 v_mul_f32_e32 v17, v0, v48 s_addc_u32 s24, s17, s25 v_cvt_pk_bf16_f32 v6, v6, v7 v_cvt_pk_bf16_f32 v7, v12, v16 v_cvt_pk_bf16_f32 v8, v8, v9 v_mov_b32_e32 v14, s24 v_cvt_pk_bf16_f32 v15, v13, v17 v_add_co_u32_e32 v10, vcc, s19, v102 v_mul_f32_e32 v18, v0, v41 v_mul_f32_e32 v19, v0, v42 v_mul_f32_e32 v22, v0, v49 v_mul_f32_e32 v23, v0, v50 v_mul_f32_e32 v26, v0, v53 v_mul_f32_e32 v30, v0, v57 v_addc_co_u32_e32 v11, vcc, v14, v103, vcc v_mul_f32_e32 v27, v0, v54 v_mul_f32_e32 v31, v0, v58 global_store_dwordx4 v[10:11], v[2:5], off global_store_dwordx3 v[10:11], v[6:8], off offset:16 global_store_short v[10:11], v15, off offset:28 v_cvt_pk_bf16_f32 v2, v18, v22 v_cvt_pk_bf16_f32 v3, v26, v30 v_cvt_pk_bf16_f32 v4, v19, v23 v_mul_f32_e32 v20, v0, v43 v_mul_f32_e32 v21, v0, v44 v_mul_f32_e32 v24, v0, v51 v_mul_f32_e32 v25, v0, v52 v_mul_f32_e32 v28, v0, v55 v_mul_f32_e32 v29, v0, v56 v_mul_f32_e32 v32, v0, v59 v_mul_f32_e32 v0, v0, v60 v_cvt_pk_bf16_f32 v5, v27, v31 v_mov_b32_e32 v13, s24 v_cvt_pk_bf16_f32 v6, v20, v24 v_cvt_pk_bf16_f32 v7, v28, v32 v_cvt_pk_bf16_f32 v8, v21, v25 v_cvt_pk_bf16_f32 v9, v29, v0 v_add_co_u32_e32 v12, vcc, s19, v104 v_addc_co_u32_e32 v13, vcc, v13, v105, vcc global_store_short_d16_hi v[12:13], v15, off global_store_dwordx4 v[10:11], v[2:5], off offset:512 global_store_dwordx4 v[10:11], v[6:9], off offset:528 s_branch .LBB12_3 .LBB12_45: ; %.loopexit s_endpgm .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .amdhsa_group_segment_fixed_size 0 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 336 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 
; ---------------------------------------------------------------------------
; Continuation of the .amdhsa_kernel descriptor opened on the preceding line
; for the flash_fwd_splitkv_mla_kernel_fp8<..., Lb0, ...> kernel. The
; directives below fill in the remaining kernel-descriptor fields and are
; closed by .end_amdhsa_kernel.
; ---------------------------------------------------------------------------
; Optional user SGPRs that this kernel does not request (all 0).
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_flat_scratch_init 0
.amdhsa_user_sgpr_private_segment_size 0
; No dynamically sized stack.
.amdhsa_uses_dynamic_stack 0
; System SGPRs: the scratch wavefront offset is not requested; workgroup ids
; X, Y and Z are all enabled; the workgroup-info SGPR is not.
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 1
.amdhsa_system_sgpr_workgroup_info 0
; Value 0 enables only the X work-item id VGPR (1 would add Y, 2 would add Z).
.amdhsa_system_vgpr_workitem_id 0
; Register budget: highest explicitly referenced register number plus one.
; NOTE(review): the "; NumVgprs: 152" kernel-info comment further down does not
; match the 160 declared here, while the kernel body references VGPRs up to
; v159 (e.g. v[158:159]). The info comment looks stale relative to this
; descriptor - confirm which one the build flow treats as authoritative.
.amdhsa_next_free_vgpr 160
.amdhsa_next_free_sgpr 92
; Extra SGPR reservations: none for flat scratch, XNACK mask reserved.
.amdhsa_reserve_flat_scratch 0
.amdhsa_reserve_xnack_mask 1
; Floating-point mode: rounding mode 0 (round to nearest even) for all
; precisions, denormal mode 3 (no flushing of sources or results). This agrees
; with the "; FloatMode: 240" (0xF0) kernel-info comment below.
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
; DX10 clamp and IEEE mode are both enabled; FP16 overflow mode is off.
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
; Target-specific (gfx938) controls. NOTE(review): these directives are not
; part of the upstream LLVM AMDGPU documentation; presumably they map onto the
; COMPUTE_PGM_RSRC3_GFX938 BF16_OVFL / BF8_OVFL / FP8_OVFL / BF16_DENORM fields
; listed in the kernel-info comments - verify against the vendor toolchain.
.amdhsa_bf16_overflow 0
.amdhsa_bf8_overflow 1
.amdhsa_fp8_overflow 1
.amdhsa_bf16_denorm_mode 3
.amdhsa_load_store_out_of_order 0
.amdhsa_matrix_excp 0
; All floating-point and integer exception enables are left off.
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
; Switch back to the kernel's text section, mark its end (.Lfunc_end12) and
; record the symbol size; the generated kernel statistics follow as comments.
.section .text._ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params,#alloc,#execinstr .Lfunc_end12: .size _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params, .Lfunc_end12-_ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 7172 ; NumSgprs: 96 ; NumVgprs: 152 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ;
LDSByteSize: 0 bytes/workgroup (compile time only) ; SGPRBlocks: 11 ; VGPRBlocks: 37 ; NumSGPRsForWavesPerEU: 96 ; NumVGPRsForWavesPerEU: 152 ; Occupancy: 1 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF16_OVFL: 0 ; COMPUTE_PGM_RSRC3_GFX938:BF8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:FP8_OVFL: 1 ; COMPUTE_PGM_RSRC3_GFX938:BF16_DENORM: 3 .ident "clang version 17.0.0 (http://10.4.0.1/dcutoolkit/driverruntime/llvm-project.git a12c71693a404295bcff3a05ea06fbbba30c4a32)" .section ".note.GNU-stack" .addrsig .addrsig_sym _ZN5flash13shared_memoryE .amdgpu_metadata --- amdhsa.kernels: - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 0 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 512 .name: _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 100 .sgpr_spill_count: 2 .symbol: _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 232 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 0 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 512 .name: 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 94 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp1I38Flash_fwd_kernel_traits_mla_qkvfp8_TP1ILi576ELi64ELi64ELi8EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP1IS5_EEEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 223 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 128 .name: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 36 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi32ELi128EEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 52 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 256 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 128 .name: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 36 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi64ELi128EEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 52 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 
.value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 288 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 128 .name: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 36 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi72ELi128EEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 52 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 384 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 128 .name: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 36 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi96ELi128EEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 52 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 512 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 128 .name: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 36 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi128ELi128EEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 52 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: 
by_value .fp64_status: 0 .group_segment_fixed_size: 576 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 128 .name: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 39 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi144ELi128EEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 52 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 640 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 128 .name: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 39 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_combine_kernelIN7cutlass10bfloat16_tEflLi512ELi160ELi128EEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 52 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 0 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 84 .sgpr_spill_count: 0 .symbol: 
_ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 251 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 0 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 77 .sgpr_spill_count: 0 .symbol: _ZN5flash36flash_fwd_splitkv_mla_kernel_fp8_tp4I38Flash_fwd_kernel_traits_mla_qkvfp8_TP4ILi576ELi32ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_26SharedStorageMLAFloat8_TP4IS5_EEEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 250 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 0 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 100 .sgpr_spill_count: 4 .symbol: _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb1ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 
.uses_dynamic_stack: false .vgpr_count: 159 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .offset: 0 .size: 336 .value_kind: by_value .fp64_status: 0 .group_segment_fixed_size: 0 .kernarg_segment_align: 8 .kernarg_segment_size: 336 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params .private_segment_fixed_size: 0 .sgpr_count: 96 .sgpr_spill_count: 0 .symbol: _ZN5flash32flash_fwd_splitkv_mla_kernel_fp8I34Flash_fwd_kernel_traits_mla_qkvfp8ILi576ELi16ELi64ELi4EN7cutlass12float_e4m3_tENS2_10bfloat16_tELi512EELb0ENS_22SharedStorageMLAFloat8IS5_EEEEv20Flash_fwd_mla_params.kd .uniform_work_group_size: 1 .uses_dynamic_stack: false .vgpr_count: 160 .vgpr_spill_count: 0 .wavefront_size: 64 amdhsa.target: 'amdgcn-amd-amdhsa--gfx938:sramecc+' amdhsa.version: - 1 - 2 ... .end_amdgpu_metadata