.text .amdgcn_target "amdgcn-amd-amdhsa--gfx908" .section .text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb
1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr .protected _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS
_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- Begin function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EEL
b1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .globl _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT
1_T2_T3_T4_T5_T6_T7_T8_ .p2align 8 .type _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3
_T4_T5_T6_T7_T8_,@function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T
8_: ; @_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; %bb.0: 
s_load_dwordx2 s[12:13], s[4:5], 0x0 s_load_dwordx2 s[16:17], s[4:5], 0x8 s_load_dwordx2 s[8:9], s[4:5], 0x10 s_load_dwordx2 s[2:3], s[4:5], 0x24 s_load_dword s44, s[4:5], 0x48 s_load_dword s10, s[4:5], 0x50 s_load_dword s11, s[4:5], 0x58 s_load_dwordx2 s[40:41], s[4:5], 0x6c s_load_dword s7, s[4:5], 0x84 s_load_dwordx4 s[20:23], s[4:5], 0x1e0 s_load_dwordx4 s[24:27], s[4:5], 0x1f4 s_load_dwordx4 s[36:39], s[4:5], 0x208 s_load_dwordx2 s[28:29], s[4:5], 0x120 s_load_dwordx2 s[30:31], s[4:5], 0x12c s_load_dwordx2 s[0:1], s[4:5], 0x13c s_load_dwordx2 s[18:19], s[4:5], 0x148 s_load_dword s34, s[4:5], 0x1d4 s_waitcnt lgkmcnt(0) s_mul_hi_u32 s14, s27, s6 s_add_i32 s14, s6, s14 s_lshr_b32 s14, s14, s39 s_mul_i32 s15, s14, s23 s_sub_i32 s6, s6, s15 s_mul_hi_u32 s15, s14, s26 s_add_i32 s15, s14, s15 s_lshr_b32 s23, s15, s38 s_mul_i32 s15, s23, s22 s_sub_i32 s14, s14, s15 s_mul_hi_u32 s15, s23, s25 s_add_i32 s15, s23, s15 s_lshr_b32 s15, s15, s37 s_mul_hi_u32 s22, s15, s24 s_add_i32 s22, s15, s22 s_lshr_b32 s22, s22, s36 v_lshrrev_b32_e32 v1, 5, v0 v_lshrrev_b32_e32 v29, 7, v0 v_mad_i32_i24 v30, v29, -4, v1 s_mul_i32 s24, s22, s41 v_add_u32_e32 v58, s24, v30 v_mul_hi_u32 v2, v58, s10 s_load_dword s25, s[4:5], 0x1c4 s_load_dword s33, s[4:5], 0x1b0 s_mul_i32 s20, s22, s20 v_add_u32_e32 v2, v58, v2 v_lshrrev_b32_e32 v31, s11, v2 v_mul_lo_u32 v2, v31, s44 v_lshlrev_b32_e32 v37, 2, v29 s_mul_i32 s21, s15, s21 s_sub_i32 s15, s15, s20 v_sub_u32_e32 v38, v58, v2 v_lshl_or_b32 v2, v31, 3, v37 s_waitcnt lgkmcnt(0) s_mul_i32 s42, s15, s25 s_movk_i32 s15, 0xffe0 v_mul_lo_u32 v2, v2, s2 v_mul_lo_u32 v3, v38, s3 s_add_i32 s42, s42, s14 v_mad_i32_i24 v39, v1, s15, v0 s_lshl_b32 s14, s42, 8 v_lshlrev_b32_e32 v17, 3, v39 v_add_u32_e32 v1, s14, v17 v_add3_u32 v1, v1, v2, v3 s_lshl_b32 s14, s7, 1 s_mov_b32 s15, 0x20000 v_lshlrev_b32_e32 v9, 1, v1 v_add_u32_e32 v10, s2, v1 v_lshlrev_b32_e32 v11, 1, v10 buffer_load_dwordx4 v[1:4], v9, s[12:15], 0 offen buffer_load_dwordx4 v[5:8], v11, s[12:15], 
0 offen v_add_u32_e32 v9, s2, v10 v_lshlrev_b32_e32 v18, 1, v9 v_add_u32_e32 v40, s2, v9 v_lshlrev_b32_e32 v19, 1, v40 buffer_load_dwordx4 v[9:12], v18, s[12:15], 0 offen buffer_load_dwordx4 v[13:16], v19, s[12:15], 0 offen s_sub_i32 s7, s23, s21 s_mul_i32 s7, s7, s34 s_add_i32 s6, s6, s7 s_lshl_b32 s43, s6, 8 s_load_dwordx2 s[20:21], s[4:5], 0x154 s_load_dword s6, s[4:5], 0x16c v_add_u32_e32 v17, s43, v17 v_mul_hi_u32 v18, v17, s19 s_load_dword s19, s[4:5], 0x180 s_load_dword s41, s[4:5], 0x18c v_accvgpr_write_b32 a240, 0 s_waitcnt lgkmcnt(0) s_mul_i32 s22, s22, s6 v_add_u32_e32 v57, s22, v30 v_mul_hi_u32 v20, v57, s29 v_add_u32_e32 v18, v17, v18 v_lshrrev_b32_e32 v18, s21, v18 v_mul_hi_u32 v19, v18, s18 v_add_u32_e32 v20, v57, v20 v_lshrrev_b32_e32 v20, s31, v20 s_load_dwordx2 s[6:7], s[4:5], 0xbc s_load_dwordx2 s[34:35], s[4:5], 0xd4 s_load_dwordx2 s[36:37], s[4:5], 0xe4 s_load_dwordx2 s[38:39], s[4:5], 0x114 v_mul_hi_u32 v21, v20, s28 v_add_u32_e32 v19, v18, v19 v_lshrrev_b32_e32 v19, s20, v19 v_mul_lo_u32 v22, v19, s0 v_add_u32_e32 v21, v20, v21 s_waitcnt lgkmcnt(0) v_mul_lo_u32 v23, v20, s39 v_lshrrev_b32_e32 v41, s30, v21 v_mul_lo_u32 v21, v41, s38 v_sub_u32_e32 v22, v18, v22 v_sub_u32_e32 v44, v57, v23 s_load_dwordx4 s[20:23], s[4:5], 0x98 s_load_dwordx4 s[24:27], s[4:5], 0xac v_sub_u32_e32 v46, v20, v21 v_mul_lo_u32 v20, v22, s36 v_mul_lo_u32 v21, v44, s37 v_mul_lo_u32 v19, v19, s34 v_mul_lo_u32 v22, v46, s35 v_mul_lo_u32 v18, v18, s1 v_add_u32_e32 v52, v21, v20 v_lshl_or_b32 v23, v41, 3, v37 v_add_u32_e32 v53, v22, v19 v_subrev_u32_e32 v19, s6, v52 s_waitcnt lgkmcnt(0) v_subrev_u32_e32 v20, s25, v53 v_mul_lo_u32 v19, v19, s22 s_sub_i32 s5, s27, s7 v_mul_lo_u32 v21, v23, s20 v_mul_lo_u32 v20, v20, s21 v_cmp_le_i32_e32 vcc, s6, v52 v_cmp_gt_i32_e64 s[0:1], s5, v52 s_sub_i32 s7, s24, s26 s_and_b64 s[46:47], vcc, s[0:1] v_cmp_le_i32_e32 vcc, s25, v53 v_cmp_gt_i32_e64 s[0:1], s7, v53 v_sub_u32_e32 v17, v17, v18 s_and_b64 s[0:1], vcc, s[0:1] s_brev_b32 s23, -2 
v_add_u32_e32 v17, v17, v19 v_mov_b32_e32 v54, s23 s_and_b64 s[0:1], s[46:47], s[0:1] v_add3_u32 v17, v17, v21, v20 v_cndmask_b32_e64 v25, v54, 0, s[0:1] s_lshl_b32 s18, s19, 1 s_mov_b32 s19, s15 v_lshl_add_u32 v26, v17, 1, v25 v_add_u32_e32 v27, s20, v17 v_lshl_add_u32 v28, v27, 1, v25 buffer_load_dwordx4 v[17:20], v26, s[16:19], 0 offen buffer_load_dwordx4 v[21:24], v28, s[16:19], 0 offen v_add_u32_e32 v26, s20, v27 v_lshl_add_u32 v42, v26, 1, v25 v_add_u32_e32 v55, s20, v26 v_lshl_add_u32 v43, v55, 1, v25 buffer_load_dwordx4 v[25:28], v42, s[16:19], 0 offen buffer_load_dwordx4 v[33:36], v43, s[16:19], 0 offen s_movk_i32 s0, 0x880 s_waitcnt vmcnt(6) ;;#ASMSTART v_pack_b32_f16 v48, v1, v5 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v50, v1, v5, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(4) ;;#ASMSTART v_pack_b32_f16 v49, v9, v13 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v51, v9, v13, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v1, v2, v6 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v2, v6, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v2, v10, v14 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v10, v14, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v3, v7 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v3, v7, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v11, v15 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v11, v15, op_sel:[1, 1] ;;#ASMEND v_mul_lo_u32 v11, v30, s0 ;;#ASMSTART v_pack_b32_f16 v3, v4, v8 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v4, v8, op_sel:[1, 1] ;;#ASMEND s_movk_i32 s24, 0x44 v_and_b32_e32 v4, 63, v0 v_and_b32_e32 v8, 32, v0 v_mul_lo_u32 v15, v39, s24 v_sub_u32_e32 v39, v4, v8 v_lshrrev_b32_e32 v4, 4, v0 v_or_b32_e32 v11, v11, v37 v_and_b32_e32 v37, 2, v4 ;;#ASMSTART v_pack_b32_f16 v4, v12, v16 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v12, v16, op_sel:[1, 1] ;;#ASMEND v_add_u32_e32 v12, 4, v58 v_mul_hi_u32 v16, v12, s10 v_lshlrev_b32_e32 v30, 5, v29 v_add_u32_e32 v56, v39, v30 v_ashrrev_i16_e32 v42, 15, v56 v_lshrrev_b16_e32 v42, 13, v42 
v_add_u32_e32 v16, v12, v16 v_add_u16_e32 v59, v56, v42 v_lshrrev_b32_e32 v42, s11, v16 v_mul_lo_u32 v16, v42, s44 v_mul_u32_u24_e32 v60, s0, v37 v_lshrrev_b32_e32 v37, 6, v0 v_mad_i32_i24 v45, v29, -2, v37 v_add_u32_e32 v37, 4, v57 v_sub_u32_e32 v43, v12, v16 v_sub_u32_e32 v16, v42, v31 v_mul_hi_u32 v31, v37, s29 v_sub_u32_e32 v12, v43, v38 v_lshl_add_u32 v16, v16, 3, -3 v_mul_lo_u32 v16, v16, s2 v_add_u32_e32 v31, v37, v31 v_lshrrev_b32_e32 v38, s31, v31 v_mul_lo_u32 v12, v12, s3 v_mul_hi_u32 v47, v38, s28 v_lshl_add_u32 v31, v45, 5, v39 v_ashrrev_i32_e32 v39, 31, v31 v_add3_u32 v61, v12, v16, v40 v_add_u32_e32 v16, v38, v47 v_mul_lo_u32 v12, v38, s39 v_lshrrev_b32_e32 v45, s30, v16 v_mul_lo_u32 v16, v45, s38 v_lshrrev_b32_e32 v39, 29, v39 v_sub_u32_e32 v47, v37, v12 v_sub_u32_e32 v12, v47, v44 v_sub_u32_e32 v44, v38, v16 v_sub_u32_e32 v16, v44, v46 v_mul_lo_u32 v12, v12, s37 v_mul_lo_u32 v16, v16, s35 v_sub_u32_e32 v38, v45, v41 v_lshl_add_u32 v38, v38, 3, -3 v_mul_lo_u32 v38, v38, s20 v_mul_lo_u32 v40, v12, s22 v_mul_lo_u32 v41, v16, s21 v_add_u32_e32 v39, v31, v39 v_ashrrev_i32_e32 v46, 3, v39 v_add_u32_e32 v38, v38, v40 v_add3_u32 v41, v38, v41, v55 v_mul_lo_u32 v38, v46, s24 v_and_b32_e32 v39, -8, v39 v_sub_u32_e32 v62, v31, v39 v_lshlrev_b32_e32 v39, 3, v62 v_add3_u32 v63, v38, v60, v39 v_ashrrev_i16_e32 v38, 3, v59 v_bfe_i32 v64, v38, 0, 16 v_and_b32_e32 v38, -8, v59 v_sub_u16_e32 v38, v56, v38 v_bfe_i32 v59, v38, 0, 16 v_add_u32_e32 v38, v12, v52 v_add_u32_e32 v39, v16, v53 v_add_lshl_u32 v40, v11, v15, 1 v_cmp_le_i32_e32 vcc, s6, v38 v_cmp_gt_i32_e64 s[0:1], s5, v38 ds_write2_b64 v40, v[48:49], v[50:51] offset1:2 ds_write2_b64 v40, v[1:2], v[5:6] offset0:4 offset1:6 ds_write2_b64 v40, v[9:10], v[13:14] offset0:8 offset1:10 ds_write2_b64 v40, v[3:4], v[7:8] offset0:12 offset1:14 s_waitcnt vmcnt(2) ;;#ASMSTART v_pack_b32_f16 v1, v17, v21 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v17, v21, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(0) ;;#ASMSTART v_pack_b32_f16 
v2, v25, v33 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v25, v33, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v18, v22 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v18, v22, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v26, v34 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v26, v34, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v19, v23 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v11, v19, v23, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v27, v35 ;;#ASMEND v_add_u32_e32 v17, 0x4000, v40 s_and_b64 s[26:27], vcc, s[0:1] v_cmp_le_i32_e32 vcc, s25, v39 v_cmp_gt_i32_e64 s[0:1], s7, v39 ;;#ASMSTART v_pack_b32_f16 v12, v27, v35, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v20, v24 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v15, v20, v24, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v28, v36 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v16, v28, v36, op_sel:[1, 1] ;;#ASMEND ds_write2_b64 v17, v[1:2], v[3:4] offset0:128 offset1:130 ds_write2_b64 v17, v[5:6], v[7:8] offset0:132 offset1:134 ds_write2_b64 v17, v[9:10], v[11:12] offset0:136 offset1:138 ds_write2_b64 v17, v[13:14], v[15:16] offset0:140 offset1:142 v_lshlrev_b32_e32 v9, 1, v61 v_add_u32_e32 v10, s2, v61 s_and_b64 s[0:1], vcc, s[0:1] v_lshlrev_b32_e32 v11, 1, v10 buffer_load_dwordx4 v[1:4], v9, s[12:15], 0 offen buffer_load_dwordx4 v[5:8], v11, s[12:15], 0 offen v_add_u32_e32 v9, s2, v10 s_and_b64 s[0:1], s[26:27], s[0:1] v_add_u32_e32 v49, s2, v9 v_cndmask_b32_e64 v25, v54, 0, s[0:1] v_lshlrev_b32_e32 v17, 1, v9 v_lshlrev_b32_e32 v18, 1, v49 v_lshl_add_u32 v26, v41, 1, v25 v_add_u32_e32 v27, s20, v41 buffer_load_dwordx4 v[9:12], v17, s[12:15], 0 offen buffer_load_dwordx4 v[13:16], v18, s[12:15], 0 offen v_lshl_add_u32 v28, v27, 1, v25 buffer_load_dwordx4 v[17:20], v26, s[16:19], 0 offen buffer_load_dwordx4 v[21:24], v28, s[16:19], 0 offen v_add_u32_e32 v26, s20, v27 v_add_u32_e32 v55, s20, v26 v_lshl_add_u32 v48, v26, 1, v25 v_lshl_add_u32 v50, v55, 1, v25 v_add_u32_e32 v25, 64, 
v56 v_lshrrev_b32_e32 v25, 3, v25 v_sub_u32_e32 v25, v25, v64 v_mul_lo_u32 v25, v25, s24 s_movk_i32 s0, 0x80 s_movk_i32 s1, 0xc0 v_and_b32_e32 v26, 7, v56 v_add_u32_e32 v27, s0, v56 v_add_u32_e32 v28, s1, v56 v_sub_u32_e32 v26, v26, v59 v_lshrrev_b32_e32 v27, 3, v27 v_lshrrev_b32_e32 v28, 3, v28 v_sub_u32_e32 v27, v27, v64 v_sub_u32_e32 v28, v28, v64 v_lshlrev_b32_e32 v26, 3, v26 v_add_u32_e32 v54, 64, v31 v_mul_lo_u32 v27, v27, s24 v_mul_lo_u32 v28, v28, s24 v_add_u32_e32 v51, v26, v25 v_ashrrev_i32_e32 v25, 31, v54 v_lshrrev_b32_e32 v25, 29, v25 v_add_u32_e32 v56, v54, v25 v_lshlrev_b32_e32 v52, 3, v59 v_mad_i32_i24 v60, v64, s24, v60 v_ashrrev_i32_e32 v25, 3, v56 v_add_lshl_u32 v41, v60, v52, 1 v_add_u32_e32 v52, v26, v27 v_add_u32_e32 v53, v26, v28 v_sub_u32_e32 v59, v25, v46 buffer_load_dwordx4 v[25:28], v48, s[16:19], 0 offen buffer_load_dwordx4 v[33:36], v50, s[16:19], 0 offen s_mov_b32 s26, 0xffffff8 v_mul_lo_u32 v48, v59, s24 v_and_b32_e32 v50, s26, v56 v_sub_u32_e32 v50, v54, v50 v_sub_u32_e32 v50, v50, v62 v_lshl_add_u32 v59, v50, 3, v48 v_add_u32_e32 v48, s0, v31 v_ashrrev_i32_e32 v50, 31, v48 v_lshrrev_b32_e32 v50, 29, v50 v_add_u32_e32 v50, v48, v50 v_ashrrev_i32_e32 v54, 3, v50 v_sub_u32_e32 v54, v54, v46 v_mul_lo_u32 v54, v54, s24 v_and_b32_e32 v50, s26, v50 v_sub_u32_e32 v48, v48, v50 v_sub_u32_e32 v48, v48, v62 v_lshl_add_u32 v60, v48, 3, v54 v_add_u32_e32 v48, s1, v31 v_ashrrev_i32_e32 v50, 31, v48 v_lshrrev_b32_e32 v50, 29, v50 v_add_u32_e32 v50, v48, v50 v_ashrrev_i32_e32 v54, 3, v50 v_sub_u32_e32 v61, v54, v46 v_and_b32_e32 v46, s26, v50 v_lshl_add_u32 v50, v51, 1, v41 v_mul_lo_u32 v51, v61, s24 s_movk_i32 s0, 0x4400 v_sub_u32_e32 v64, v48, v46 v_lshl_add_u32 v54, v63, 1, s0 v_lshl_add_u32 v46, v53, 1, v41 v_lshl_add_u32 v53, v59, 1, v54 v_sub_u32_e32 v59, v64, v62 v_accvgpr_write_b32 a241, 0 v_accvgpr_write_b32 a242, 0 v_accvgpr_write_b32 a243, 0 v_accvgpr_write_b32 a244, 0 v_accvgpr_write_b32 a245, 0 v_accvgpr_write_b32 a246, 0 
v_accvgpr_write_b32 a247, 0 v_accvgpr_write_b32 a248, 0 v_accvgpr_write_b32 a249, 0 v_accvgpr_write_b32 a250, 0 v_accvgpr_write_b32 a251, 0 v_accvgpr_write_b32 a252, 0 v_accvgpr_write_b32 a253, 0 v_accvgpr_write_b32 a254, 0 v_accvgpr_write_b32 a255, 0 v_accvgpr_write_b32 a224, 0 v_accvgpr_write_b32 a208, 0 v_accvgpr_write_b32 a192, 0 v_accvgpr_write_b32 a128, 0 v_accvgpr_write_b32 a144, 0 v_accvgpr_write_b32 a160, 0 v_accvgpr_write_b32 a176, 0 v_accvgpr_write_b32 a112, 0 v_accvgpr_write_b32 a96, 0 v_accvgpr_write_b32 a80, 0 v_accvgpr_write_b32 a64, 0 v_accvgpr_write_b32 a0, 0 v_accvgpr_write_b32 a16, 0 v_accvgpr_write_b32 a32, 0 v_accvgpr_write_b32 a48, 0 v_lshl_add_u32 v51, v59, 3, v51 v_add_u32_e32 v57, 8, v57 v_add_u32_e32 v58, 8, v58 v_accvgpr_write_b32 a225, 0 v_accvgpr_write_b32 a226, 0 v_accvgpr_write_b32 a227, 0 v_accvgpr_write_b32 a228, 0 v_accvgpr_write_b32 a229, 0 v_accvgpr_write_b32 a230, 0 v_accvgpr_write_b32 a231, 0 v_accvgpr_write_b32 a232, 0 v_accvgpr_write_b32 a233, 0 v_accvgpr_write_b32 a234, 0 v_accvgpr_write_b32 a235, 0 v_accvgpr_write_b32 a236, 0 v_accvgpr_write_b32 a237, 0 v_accvgpr_write_b32 a238, 0 v_accvgpr_write_b32 a239, 0 v_accvgpr_write_b32 a209, 0 v_accvgpr_write_b32 a210, 0 v_accvgpr_write_b32 a211, 0 v_accvgpr_write_b32 a212, 0 v_accvgpr_write_b32 a213, 0 v_accvgpr_write_b32 a214, 0 v_accvgpr_write_b32 a215, 0 v_accvgpr_write_b32 a216, 0 v_accvgpr_write_b32 a217, 0 v_accvgpr_write_b32 a218, 0 v_accvgpr_write_b32 a219, 0 v_accvgpr_write_b32 a220, 0 v_accvgpr_write_b32 a221, 0 v_accvgpr_write_b32 a222, 0 v_accvgpr_write_b32 a223, 0 v_accvgpr_write_b32 a193, 0 v_accvgpr_write_b32 a194, 0 v_accvgpr_write_b32 a195, 0 v_accvgpr_write_b32 a196, 0 v_accvgpr_write_b32 a197, 0 v_accvgpr_write_b32 a198, 0 v_accvgpr_write_b32 a199, 0 v_accvgpr_write_b32 a200, 0 v_accvgpr_write_b32 a201, 0 v_accvgpr_write_b32 a202, 0 v_accvgpr_write_b32 a203, 0 v_accvgpr_write_b32 a204, 0 v_accvgpr_write_b32 a205, 0 v_accvgpr_write_b32 a206, 0 v_accvgpr_write_b32 
a207, 0 v_accvgpr_write_b32 a129, 0 v_accvgpr_write_b32 a130, 0 v_accvgpr_write_b32 a131, 0 v_accvgpr_write_b32 a132, 0 v_accvgpr_write_b32 a133, 0 v_accvgpr_write_b32 a134, 0 v_accvgpr_write_b32 a135, 0 v_accvgpr_write_b32 a136, 0 v_accvgpr_write_b32 a137, 0 v_accvgpr_write_b32 a138, 0 v_accvgpr_write_b32 a139, 0 v_accvgpr_write_b32 a140, 0 v_accvgpr_write_b32 a141, 0 v_accvgpr_write_b32 a142, 0 v_accvgpr_write_b32 a143, 0 v_accvgpr_write_b32 a145, 0 v_accvgpr_write_b32 a146, 0 v_accvgpr_write_b32 a147, 0 v_accvgpr_write_b32 a148, 0 v_accvgpr_write_b32 a149, 0 v_accvgpr_write_b32 a150, 0 v_accvgpr_write_b32 a151, 0 v_accvgpr_write_b32 a152, 0 v_accvgpr_write_b32 a153, 0 v_accvgpr_write_b32 a154, 0 v_accvgpr_write_b32 a155, 0 v_accvgpr_write_b32 a156, 0 v_accvgpr_write_b32 a157, 0 v_accvgpr_write_b32 a158, 0 v_accvgpr_write_b32 a159, 0 v_accvgpr_write_b32 a161, 0 v_accvgpr_write_b32 a162, 0 v_accvgpr_write_b32 a163, 0 v_accvgpr_write_b32 a164, 0 v_accvgpr_write_b32 a165, 0 v_accvgpr_write_b32 a166, 0 v_accvgpr_write_b32 a167, 0 v_accvgpr_write_b32 a168, 0 v_accvgpr_write_b32 a169, 0 v_accvgpr_write_b32 a170, 0 v_accvgpr_write_b32 a171, 0 v_accvgpr_write_b32 a172, 0 v_accvgpr_write_b32 a173, 0 v_accvgpr_write_b32 a174, 0 v_accvgpr_write_b32 a175, 0 v_accvgpr_write_b32 a177, 0 v_accvgpr_write_b32 a178, 0 v_accvgpr_write_b32 a179, 0 v_accvgpr_write_b32 a180, 0 v_accvgpr_write_b32 a181, 0 v_accvgpr_write_b32 a182, 0 v_accvgpr_write_b32 a183, 0 v_accvgpr_write_b32 a184, 0 v_accvgpr_write_b32 a185, 0 v_accvgpr_write_b32 a186, 0 v_accvgpr_write_b32 a187, 0 v_accvgpr_write_b32 a188, 0 v_accvgpr_write_b32 a189, 0 v_accvgpr_write_b32 a190, 0 v_accvgpr_write_b32 a191, 0 v_accvgpr_write_b32 a113, 0 v_accvgpr_write_b32 a114, 0 v_accvgpr_write_b32 a115, 0 v_accvgpr_write_b32 a116, 0 v_accvgpr_write_b32 a117, 0 v_accvgpr_write_b32 a118, 0 v_accvgpr_write_b32 a119, 0 v_accvgpr_write_b32 a120, 0 v_accvgpr_write_b32 a121, 0 v_accvgpr_write_b32 a122, 0 v_accvgpr_write_b32 a123, 0 
v_accvgpr_write_b32 a124, 0 v_accvgpr_write_b32 a125, 0 v_accvgpr_write_b32 a126, 0 v_accvgpr_write_b32 a127, 0 v_accvgpr_write_b32 a97, 0 v_accvgpr_write_b32 a98, 0 v_accvgpr_write_b32 a99, 0 v_accvgpr_write_b32 a100, 0 v_accvgpr_write_b32 a101, 0 v_accvgpr_write_b32 a102, 0 v_accvgpr_write_b32 a103, 0 v_accvgpr_write_b32 a104, 0 v_accvgpr_write_b32 a105, 0 v_accvgpr_write_b32 a106, 0 v_accvgpr_write_b32 a107, 0 v_accvgpr_write_b32 a108, 0 v_accvgpr_write_b32 a109, 0 v_accvgpr_write_b32 a110, 0 v_accvgpr_write_b32 a111, 0 v_accvgpr_write_b32 a81, 0 v_accvgpr_write_b32 a82, 0 v_accvgpr_write_b32 a83, 0 v_accvgpr_write_b32 a84, 0 v_accvgpr_write_b32 a85, 0 v_accvgpr_write_b32 a86, 0 v_accvgpr_write_b32 a87, 0 v_accvgpr_write_b32 a88, 0 v_accvgpr_write_b32 a89, 0 v_accvgpr_write_b32 a90, 0 v_accvgpr_write_b32 a91, 0 v_accvgpr_write_b32 a92, 0 v_accvgpr_write_b32 a93, 0 v_accvgpr_write_b32 a94, 0 v_accvgpr_write_b32 a95, 0 v_accvgpr_write_b32 a65, 0 v_accvgpr_write_b32 a66, 0 v_accvgpr_write_b32 a67, 0 v_accvgpr_write_b32 a68, 0 v_accvgpr_write_b32 a69, 0 v_accvgpr_write_b32 a70, 0 v_accvgpr_write_b32 a71, 0 v_accvgpr_write_b32 a72, 0 v_accvgpr_write_b32 a73, 0 v_accvgpr_write_b32 a74, 0 v_accvgpr_write_b32 a75, 0 v_accvgpr_write_b32 a76, 0 v_accvgpr_write_b32 a77, 0 v_accvgpr_write_b32 a78, 0 v_accvgpr_write_b32 a79, 0 v_accvgpr_write_b32 a1, 0 v_accvgpr_write_b32 a2, 0 v_accvgpr_write_b32 a3, 0 v_accvgpr_write_b32 a4, 0 v_accvgpr_write_b32 a5, 0 v_accvgpr_write_b32 a6, 0 v_accvgpr_write_b32 a7, 0 v_accvgpr_write_b32 a8, 0 v_accvgpr_write_b32 a9, 0 v_accvgpr_write_b32 a10, 0 v_accvgpr_write_b32 a11, 0 v_accvgpr_write_b32 a12, 0 v_accvgpr_write_b32 a13, 0 v_accvgpr_write_b32 a14, 0 v_accvgpr_write_b32 a15, 0 v_accvgpr_write_b32 a17, 0 v_accvgpr_write_b32 a18, 0 v_accvgpr_write_b32 a19, 0 v_accvgpr_write_b32 a20, 0 v_accvgpr_write_b32 a21, 0 v_accvgpr_write_b32 a22, 0 v_accvgpr_write_b32 a23, 0 v_accvgpr_write_b32 a24, 0 v_accvgpr_write_b32 a25, 0 v_accvgpr_write_b32 
a26, 0 v_accvgpr_write_b32 a27, 0 v_accvgpr_write_b32 a28, 0 v_accvgpr_write_b32 a29, 0 v_accvgpr_write_b32 a30, 0 v_accvgpr_write_b32 a31, 0 v_accvgpr_write_b32 a33, 0 v_accvgpr_write_b32 a34, 0 v_accvgpr_write_b32 a35, 0 v_accvgpr_write_b32 a36, 0 v_accvgpr_write_b32 a37, 0 v_accvgpr_write_b32 a38, 0 v_accvgpr_write_b32 a39, 0 v_accvgpr_write_b32 a40, 0 v_accvgpr_write_b32 a41, 0 v_accvgpr_write_b32 a42, 0 v_accvgpr_write_b32 a43, 0 v_accvgpr_write_b32 a44, 0 v_accvgpr_write_b32 a45, 0 v_accvgpr_write_b32 a46, 0 v_accvgpr_write_b32 a47, 0 v_accvgpr_write_b32 a49, 0 v_accvgpr_write_b32 a50, 0 v_accvgpr_write_b32 a51, 0 v_accvgpr_write_b32 a52, 0 v_accvgpr_write_b32 a53, 0 v_accvgpr_write_b32 a54, 0 v_accvgpr_write_b32 a55, 0 v_accvgpr_write_b32 a56, 0 v_accvgpr_write_b32 a57, 0 v_accvgpr_write_b32 a58, 0 v_accvgpr_write_b32 a59, 0 v_accvgpr_write_b32 a60, 0 v_accvgpr_write_b32 a61, 0 v_accvgpr_write_b32 a62, 0 v_accvgpr_write_b32 a63, 0 s_mov_b32 s34, 0 s_mov_b32 s4, s29 v_add_u32_e32 v56, s0, v40 v_lshl_add_u32 v48, v52, 1, v41 v_lshl_add_u32 v52, v60, 1, v54 v_lshl_add_u32 v51, v51, 1, v54 s_add_i32 s24, s40, -4 s_sub_i32 s26, 0, s39 s_sub_i32 s27, 0, s44 s_movk_i32 s29, 0x1000 v_mov_b32_e32 v59, v58 v_mov_b32_e32 v60, v57 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND .LBB0_1: ; 
%_ZZN2ck22move_tensor_coordinateINS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS2_IJiiiEEELb0EEENS3_INS2_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESB_NS_23Merge_v2_magic_divisionINS2_IJiiEEEEESB_NSA_IS7_EENS3_ISD_Lb0EEESB_SF_EEENS2_IJNS_8SequenceIJLi0EEEENSI_IJLi1EEEENSI_IJLi2EEEENSI_IJLi3EEEENSI_IJLi4ELi6EEEENSI_IJLi7EEEENSI_IJLi5EEEENSI_IJLi8EEEENSI_IJLi9EEEENSI_IJLi10EEEEEEENS2_IJNSI_IJLi1ELi2ELi3EEEENSI_IJLi4ELi5EEEENSI_IJLi6EEEESO_SQ_SR_SS_NSI_IJLi11ELi12EEEENSI_IJLi13EEEENSI_IJLi14EEEEEEENSI_IJLi11ELi12ELi13ELi14EEEEiEENS_16TensorCoordinateILi15EKS11_EENS_20TensorCoordinateStepILi10ELi4ENSI_IJLi0ELi0ELi0ELi0ELi0ELi0ELi0ELi0ELi0ELi0EEEEEEEEvRKT_RT0_RKT1_ENKUlS19_E_clINS6_IiLi9EEEEEDaS19_.exit.i.i.i.i.i333.i ; =>This Inner Loop Header: Depth=1 ds_read2_b64 v[61:64], v41 offset1:1 ds_read2_b64 v[65:68], v54 offset1:1 ds_read2_b64 v[69:72], v53 offset1:1 ds_read2_b64 v[73:76], v52 offset1:1 ds_read2_b64 v[77:80], v51 offset1:1 ds_read2_b64 v[81:84], v50 offset1:1 s_waitcnt lgkmcnt(4) v_mfma_f32_32x32x8f16 a[240:255], v[61:62], v[65:66], a[240:255] ds_read2_b64 v[85:88], v48 offset1:1 ds_read2_b64 v[89:92], v46 offset1:1 v_add_u32_e32 v37, 4, v37 s_waitcnt lgkmcnt(5) v_mfma_f32_32x32x8f16 a[224:239], v[61:62], v[69:70], a[224:239] s_waitcnt lgkmcnt(4) v_mfma_f32_32x32x8f16 a[208:223], v[61:62], v[73:74], a[208:223] s_waitcnt lgkmcnt(3) v_mfma_f32_32x32x8f16 a[192:207], v[61:62], v[77:78], a[192:207] v_add_u32_e32 v61, s29, v41 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[81:82], v[65:66], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[81:82], v[69:70], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[81:82], v[73:74], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[81:82], v[77:78], a[176:191] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[85:86], v[65:66], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[85:86], v[69:70], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[85:86], v[73:74], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], 
v[85:86], v[77:78], a[64:79] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[89:90], v[65:66], a[0:15] v_add_u32_e32 v65, s29, v54 v_mfma_f32_32x32x8f16 a[16:31], v[89:90], v[69:70], a[16:31] v_add_u32_e32 v69, s29, v53 v_mfma_f32_32x32x8f16 a[32:47], v[89:90], v[73:74], a[32:47] v_add_u32_e32 v73, s29, v52 v_mfma_f32_32x32x8f16 a[48:63], v[89:90], v[77:78], a[48:63] v_add_u32_e32 v77, s29, v51 v_mfma_f32_32x32x8f16 a[240:255], v[63:64], v[67:68], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[63:64], v[71:72], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[63:64], v[75:76], a[208:223] v_mfma_f32_32x32x8f16 a[192:207], v[63:64], v[79:80], a[192:207] v_mfma_f32_32x32x8f16 a[128:143], v[83:84], v[67:68], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[83:84], v[71:72], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[83:84], v[75:76], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[83:84], v[79:80], a[176:191] v_mfma_f32_32x32x8f16 a[112:127], v[87:88], v[67:68], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[87:88], v[71:72], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[87:88], v[75:76], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[87:88], v[79:80], a[64:79] v_mfma_f32_32x32x8f16 a[0:15], v[91:92], v[67:68], a[0:15] ds_read2_b64 v[93:96], v61 offset0:32 offset1:33 ds_read2_b64 v[97:100], v65 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[16:31], v[91:92], v[71:72], a[16:31] ds_read2_b64 v[101:104], v69 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[32:47], v[91:92], v[75:76], a[32:47] ds_read2_b64 v[105:108], v73 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[48:63], v[91:92], v[79:80], a[48:63] ds_read2_b64 v[109:112], v77 offset0:32 offset1:33 s_waitcnt lgkmcnt(3) v_mfma_f32_32x32x8f16 a[240:255], v[93:94], v[97:98], a[240:255] s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[224:239], v[93:94], v[101:102], a[224:239] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[208:223], v[93:94], v[105:106], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[93:94], 
v[109:110], a[192:207] v_add_u32_e32 v61, s29, v50 ds_read2_b64 v[113:116], v61 offset0:32 offset1:33 v_add_u32_e32 v61, s29, v48 ds_read2_b64 v[117:120], v61 offset0:32 offset1:33 v_add_u32_e32 v61, s29, v46 ds_read2_b64 v[121:124], v61 offset0:32 offset1:33 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[113:114], v[97:98], a[128:143] ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[117:118], v[97:98], a[112:127] s_waitcnt vmcnt(6) ;;#ASMSTART v_pack_b32_f16 v61, v1, v5 ;;#ASMEND v_mul_hi_u32 v65, s4, v60 v_mul_hi_u32 v66, s10, v59 v_add_u32_e32 v60, 4, v60 v_add_u32_e32 v59, 4, v59 ;;#ASMSTART v_pack_b32_f16 v63, v1, v5, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(4) ;;#ASMSTART v_pack_b32_f16 v62, v9, v13 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v64, v9, v13, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v1, v2, v6 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v2, v6, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[121:122], v[97:98], a[0:15] v_pack_b32_f16 v2, v10, v14 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v10, v14, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v3, v7 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v3, v7, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v11, v15 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v11, v15, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v4, v8 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v4, v8, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v12, v16 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v12, v16, op_sel:[1, 1] ;;#ASMEND v_mfma_f32_32x32x8f16 a[240:255], v[95:96], v[99:100], a[240:255] ds_write2_b64 v40, v[61:62], v[63:64] offset1:2 v_mfma_f32_32x32x8f16 a[224:239], v[95:96], v[103:104], a[224:239] ds_write2_b64 v40, v[1:2], v[5:6] offset0:4 offset1:6 v_mfma_f32_32x32x8f16 a[208:223], v[95:96], v[107:108], a[208:223] ds_write2_b64 v40, v[9:10], v[13:14] offset0:8 offset1:10 
v_mfma_f32_32x32x8f16 a[192:207], v[95:96], v[111:112], a[192:207] ds_write2_b64 v40, v[3:4], v[7:8] offset0:12 offset1:14 s_waitcnt vmcnt(2) ;;#ASMSTART v_pack_b32_f16 v1, v17, v21 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v17, v21, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(0) ;;#ASMSTART v_pack_b32_f16 v2, v25, v33 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v25, v33, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_mfma_f32_32x32x8f16 a[144:159], v[113:114], v[101:102], a[144:159] v_pack_b32_f16 v5, v18, v22 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v18, v22, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v26, v34 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v26, v34, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v19, v23 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v11, v19, v23, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v27, v35 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v12, v27, v35, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v20, v24 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v15, v20, v24, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v28, v36 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v16, v28, v36, op_sel:[1, 1] ;;#ASMEND v_mfma_f32_32x32x8f16 a[96:111], v[117:118], v[101:102], a[96:111] ds_write2_b64 v56, v[1:2], v[3:4] offset1:2 v_mfma_f32_32x32x8f16 a[16:31], v[121:122], v[101:102], a[16:31] ds_write2_b64 v56, v[5:6], v[7:8] offset0:4 offset1:6 v_mfma_f32_32x32x8f16 a[160:175], v[113:114], v[105:106], a[160:175] ds_write2_b64 v56, v[9:10], v[11:12] offset0:8 offset1:10 v_mfma_f32_32x32x8f16 a[176:191], v[113:114], v[109:110], a[176:191] ds_write2_b64 v56, v[13:14], v[15:16] offset0:12 offset1:14 v_add3_u32 v1, v58, v66, s34 v_add3_u32 v2, v57, v65, s34 v_lshrrev_b32_e32 v61, s11, v1 v_lshrrev_b32_e32 v62, s31, v2 v_mul_lo_u32 v63, s27, v61 v_mfma_f32_32x32x8f16 a[80:95], v[117:118], v[105:106], a[80:95] v_mul_hi_u32 v3, v62, s28 v_mul_lo_u32 v2, s26, v62 v_sub_u32_e32 v4, v63, v43 v_add_u32_e32 v3, v62, v3 v_sub_u32_e32 v2, v2, v47 
v_lshrrev_b32_e32 v64, s30, v3 v_add3_u32 v2, v57, s34, v2 v_mul_lo_u32 v2, v2, s37 v_sub_u32_e32 v1, v61, v42 v_sub_u32_e32 v5, v64, v45 v_lshl_add_u32 v1, v1, 3, -3 v_mfma_f32_32x32x8f16 a[64:79], v[117:118], v[109:110], a[64:79] v_add_u32_e32 v38, v2, v38 v_mul_lo_u32 v9, v2, s22 v_lshl_add_u32 v5, v5, 3, -3 v_mul_lo_u32 v1, v1, s2 v_mul_lo_u32 v17, v5, s20 v_cmp_le_i32_e32 vcc, s6, v38 v_cmp_gt_i32_e64 s[0:1], s5, v38 s_and_b64 s[44:45], vcc, s[0:1] v_add_u32_e32 v19, v9, v55 v_mul_lo_u32 v43, v62, s39 v_mfma_f32_32x32x8f16 a[32:47], v[121:122], v[105:106], a[32:47] v_mov_b32_e32 v42, v61 v_mov_b32_e32 v45, v64 v_sub_u32_e32 v47, v37, v43 v_add_u32_e32 v69, s34, v58 v_add_u32_e32 v4, v69, v4 v_mul_lo_u32 v3, v4, s3 v_mul_lo_u32 v4, v64, s38 v_mov_b32_e32 v70, s23 s_add_i32 s34, s34, 4 v_add3_u32 v1, v1, v49, v3 v_sub_u32_e32 v65, v62, v4 v_sub_u32_e32 v2, v65, v44 v_mfma_f32_32x32x8f16 a[48:63], v[121:122], v[109:110], a[48:63] v_mul_lo_u32 v18, v2, s35 v_lshlrev_b32_e32 v3, 1, v1 v_add_u32_e32 v1, s2, v1 v_add_u32_e32 v10, s2, v1 v_add_u32_e32 v39, v18, v39 v_mul_lo_u32 v18, v18, s21 v_cmp_le_i32_e32 vcc, s25, v39 v_cmp_gt_i32_e64 s[0:1], s7, v39 s_and_b64 s[0:1], vcc, s[0:1] s_and_b64 s[0:1], s[44:45], s[0:1] v_cndmask_b32_e64 v25, v70, 0, s[0:1] v_add3_u32 v17, v19, v17, v18 v_mfma_f32_32x32x8f16 a[128:143], v[115:116], v[99:100], a[128:143] v_lshl_add_u32 v18, v17, 1, v25 v_add_u32_e32 v17, s20, v17 v_add_u32_e32 v26, s20, v17 v_add_u32_e32 v49, s2, v10 v_add_u32_e32 v55, s20, v26 v_lshlrev_b32_e32 v5, 1, v1 v_lshlrev_b32_e32 v11, 1, v10 v_lshlrev_b32_e32 v13, 1, v49 v_lshl_add_u32 v21, v17, 1, v25 v_lshl_add_u32 v27, v26, 1, v25 v_lshl_add_u32 v33, v55, 1, v25 v_mfma_f32_32x32x8f16 a[144:159], v[115:116], v[103:104], a[144:159] buffer_load_dwordx4 v[1:4], v3, s[12:15], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[160:175], v[115:116], v[107:108], a[160:175] buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[176:191], v[115:116], 
v[111:112], a[176:191] buffer_load_dwordx4 v[9:12], v11, s[12:15], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[112:127], v[119:120], v[99:100], a[112:127] buffer_load_dwordx4 v[13:16], v13, s[12:15], 0 offen ;;#ASMSTART s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[96:111], v[119:120], v[103:104], a[96:111] s_barrier ;;#ASMEND v_mfma_f32_32x32x8f16 a[80:95], v[119:120], v[107:108], a[80:95] buffer_load_dwordx4 v[17:20], v18, s[16:19], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[64:79], v[119:120], v[111:112], a[64:79] buffer_load_dwordx4 v[21:24], v21, s[16:19], 0 offen v_mfma_f32_32x32x8f16 a[0:15], v[123:124], v[99:100], a[0:15] buffer_load_dwordx4 v[25:28], v27, s[16:19], 0 offen v_add_u32_e32 v44, v69, v63 v_mfma_f32_32x32x8f16 a[16:31], v[123:124], v[103:104], a[16:31] buffer_load_dwordx4 v[33:36], v33, s[16:19], 0 offen v_mov_b32_e32 v43, v44 s_cmp_lt_i32 s34, s24 v_mov_b32_e32 v44, v65 v_mfma_f32_32x32x8f16 a[32:47], v[123:124], v[107:108], a[32:47] v_mfma_f32_32x32x8f16 a[48:63], v[123:124], v[111:112], a[48:63] s_cbranch_scc1 .LBB0_1 ; %bb.2: ; %_ZZN2ck23Merge_v2_magic_divisionINS_5TupleIJNS_17integral_constantIiLi4EEENS2_IiLi2EEEiiiEEEEC1ERKS5_ENKUlT_E_clIS4_EEDaS9_.exit.i.i.i.i.i.i.i.i ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_waitcnt vmcnt(7) ds_read2_b64 v[1:4], v41 offset1:1 s_waitcnt vmcnt(6) ds_read2_b64 v[5:8], v54 offset1:1 s_waitcnt vmcnt(5) ds_read2_b64 v[9:12], v53 offset1:1 s_waitcnt vmcnt(4) ds_read2_b64 v[13:16], v52 offset1:1 s_waitcnt vmcnt(3) ds_read2_b64 v[17:20], v51 offset1:1 s_waitcnt lgkmcnt(3) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[240:255] s_movk_i32 s0, 0x1000 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[224:239] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[17:18], a[192:207] v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], 
v[11:12], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[19:20], a[192:207] ds_read2_b64 v[1:4], v50 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[128:143], v[1:2], v[5:6], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[1:2], v[9:10], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[1:2], v[13:14], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[1:2], v[17:18], a[176:191] v_mfma_f32_32x32x8f16 a[128:143], v[3:4], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[3:4], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[3:4], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[3:4], v[19:20], a[176:191] ds_read2_b64 v[1:4], v48 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[112:127], v[1:2], v[5:6], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[1:2], v[9:10], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[1:2], v[13:14], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[1:2], v[17:18], a[64:79] v_mfma_f32_32x32x8f16 a[112:127], v[3:4], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[3:4], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[3:4], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[3:4], v[19:20], a[64:79] ds_read2_b64 v[1:4], v46 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[1:2], v[5:6], a[0:15] v_add_u32_e32 v5, s0, v54 v_mfma_f32_32x32x8f16 a[16:31], v[1:2], v[9:10], a[16:31] v_add_u32_e32 v9, s0, v53 v_mfma_f32_32x32x8f16 a[32:47], v[1:2], v[13:14], a[32:47] v_add_u32_e32 v13, s0, v52 v_mfma_f32_32x32x8f16 a[48:63], v[1:2], v[17:18], a[48:63] v_add_u32_e32 v1, s0, v41 v_add_u32_e32 v17, s0, v51 v_mfma_f32_32x32x8f16 a[0:15], v[3:4], v[7:8], a[0:15] ds_read2_b64 v[5:8], v5 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[16:31], v[3:4], v[11:12], a[16:31] ds_read2_b64 v[9:12], v9 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[32:47], v[3:4], v[15:16], a[32:47] ds_read2_b64 v[13:16], v13 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 
a[48:63], v[3:4], v[19:20], a[48:63] ds_read2_b64 v[1:4], v1 offset0:32 offset1:33 ds_read2_b64 v[17:20], v17 offset0:32 offset1:33 s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[17:18], a[192:207] v_add_u32_e32 v1, s0, v50 s_waitcnt vmcnt(2) ds_read2_b64 v[21:24], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v48 s_waitcnt vmcnt(1) ds_read2_b64 v[25:28], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v46 ds_read2_b64 v[49:52], v1 offset0:32 offset1:33 s_movk_i32 s0, 0x80 v_cmp_gt_u32_e32 vcc, s0, v0 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[21:22], v[5:6], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[21:22], v[9:10], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[21:22], v[13:14], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[21:22], v[17:18], a[176:191] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[25:26], v[5:6], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[25:26], v[9:10], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[25:26], v[13:14], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[25:26], v[17:18], a[64:79] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[49:50], v[5:6], a[0:15] v_mfma_f32_32x32x8f16 a[16:31], v[49:50], v[9:10], a[16:31] v_mfma_f32_32x32x8f16 a[32:47], v[49:50], v[13:14], a[32:47] v_mfma_f32_32x32x8f16 a[48:63], v[49:50], v[17:18], a[48:63] v_mul_i32_i24_e32 v17, 0xffffffe0, v29 v_mov_b32_e32 v49, 0 v_mov_b32_e32 v50, 0 v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], v[11:12], a[224:239] s_waitcnt vmcnt(0) s_nop 7 s_nop 7 v_accvgpr_read_b32 v33, a240 v_accvgpr_read_b32 v34, a241 v_accvgpr_read_b32 v35, a242 v_accvgpr_read_b32 v36, a243 v_accvgpr_read_b32 v37, a244 v_accvgpr_read_b32 v38, a245 v_accvgpr_read_b32 v39, a246 v_accvgpr_read_b32 
v40, a247 v_accvgpr_read_b32 v41, a248 v_accvgpr_read_b32 v42, a249 v_accvgpr_read_b32 v43, a250 v_accvgpr_read_b32 v44, a251 v_accvgpr_read_b32 v45, a252 v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[19:20], a[192:207] v_accvgpr_read_b32 v46, a253 v_accvgpr_read_b32 v47, a254 v_accvgpr_read_b32 v48, a255 v_mfma_f32_32x32x8f16 a[144:159], v[23:24], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[176:191], v[23:24], v[19:20], a[176:191] v_mfma_f32_32x32x8f16 a[96:111], v[27:28], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[64:79], v[27:28], v[19:20], a[64:79] v_mfma_f32_32x32x8f16 a[16:31], v[51:52], v[11:12], a[16:31] v_mfma_f32_32x32x8f16 a[48:63], v[51:52], v[19:20], a[48:63] v_mfma_f32_32x32x8f16 a[128:143], v[23:24], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[112:127], v[27:28], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[0:15], v[51:52], v[7:8], a[0:15] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[80:95], v[27:28], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[160:175], v[23:24], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[32:47], v[51:52], v[15:16], a[32:47] s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_4 ; %bb.3: v_lshrrev_b32_e32 v1, 2, v0 v_mul_i32_i24_e32 v2, -4, v1 v_add_u32_e32 v1, v17, v1 v_lshlrev_b32_e32 v3, 1, v1 v_add_u32_e32 v4, s42, v29 v_lshl_add_u32 v3, v4, 8, v3 v_mul_lo_u32 v3, v3, s41 v_add_lshl_u32 v2, v2, v0, 4 v_lshlrev_b32_e32 v4, 12, v29 v_lshlrev_b32_e32 v1, 7, v1 v_add3_u32 v50, v2, v4, v1 v_add3_u32 v49, s43, v2, v3 .LBB0_4: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EEC2ERSO_RKNSA_IJiiiiEEES15_S1A_RKS3_.exit.i s_or_b64 exec, exec, s[0:1] v_lshrrev_b32_e32 v0, 3, v0 v_and_or_b32 v0, v0, 4, v30 v_lshlrev_b32_e32 v19, 5, v29 v_lshrrev_b32_e32 v18, 6, v31 v_add3_u32 v0, v0, v17, v19 v_sub_u32_e32 v0, v0, v18 v_lshlrev_b32_e32 v0, 6, v0 v_cvt_f16_f32_e32 v17, v33 v_add_lshl_u32 v51, v0, v31, 1 v_cvt_f16_f32_e32 v0, v34 v_cvt_f16_f32_e32 v18, v35 v_cvt_f16_f32_e32 v19, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v17 ds_write_b16 v51, v0 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v17, v39 v_cvt_f16_f32_e32 v18, v38 v_cvt_f16_f32_e32 v19, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v17, v42 v_cvt_f16_f32_e32 v18, v43 v_cvt_f16_f32_e32 v19, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v17, v47 v_accvgpr_read_b32 v1, a224 v_cvt_f16_f32_e32 v18, v46 v_accvgpr_read_b32 v2, a225 v_accvgpr_read_b32 v3, a226 v_accvgpr_read_b32 v4, a227 v_accvgpr_read_b32 v5, a228 v_accvgpr_read_b32 v6, a229 v_accvgpr_read_b32 v7, a230 v_accvgpr_read_b32 
v8, a231 v_accvgpr_read_b32 v9, a232 v_accvgpr_read_b32 v10, a233 v_accvgpr_read_b32 v11, a234 v_accvgpr_read_b32 v12, a235 v_accvgpr_read_b32 v13, a236 v_accvgpr_read_b32 v14, a237 v_accvgpr_read_b32 v15, a238 v_accvgpr_read_b32 v16, a239 v_cvt_f16_f32_e32 v19, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_6 ; %bb.5: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i.i.i.i v_lshlrev_b32_e32 v0, 1, v50 ds_read_b128 v[17:20], v0 ds_read_b128 v[21:24], v0 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[17:20], v25, s[8:11], 0 offen ds_read_b128 v[17:20], v0 offset:144 ds_read_b128 v[25:28], v0 offset:128 v_add_u32_e32 v29, 8, v49 v_lshlrev_b32_e32 v30, 1, v29 v_add_lshl_u32 v0, v29, s41, 1 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[21:24], v30, s[8:11], 0 offen s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[17:20], v0, s[8:11], 0 offen v_add_lshl_u32 v0, v49, s41, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[25:28], v0, s[8:11], 0 offen .LBB0_6: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v1 v_cvt_f16_f32_e32 v1, v2 v_cvt_f16_f32_e32 v2, v3 v_cvt_f16_f32_e32 v3, v4 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v7 v_cvt_f16_f32_e32 v2, v6 v_cvt_f16_f32_e32 v3, v5 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v9 v_cvt_f16_f32_e32 v1, v10 v_cvt_f16_f32_e32 v2, v11 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v16 v_cvt_f16_f32_e32 v1, v15 v_accvgpr_read_b32 v33, a208 v_cvt_f16_f32_e32 v2, v14 v_accvgpr_read_b32 v34, a209 v_accvgpr_read_b32 v35, a210 v_accvgpr_read_b32 v36, a211 
v_accvgpr_read_b32 v37, a212 v_accvgpr_read_b32 v38, a213 v_accvgpr_read_b32 v39, a214 v_accvgpr_read_b32 v40, a215 v_accvgpr_read_b32 v41, a216 v_accvgpr_read_b32 v42, a217 v_accvgpr_read_b32 v43, a218 v_accvgpr_read_b32 v44, a219 v_accvgpr_read_b32 v45, a220 v_accvgpr_read_b32 v46, a221 v_accvgpr_read_b32 v47, a222 v_accvgpr_read_b32 v48, a223 v_cvt_f16_f32_e32 v3, v13 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_8 ; %bb.7: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s41, 1 v_add_u32_e32 v49, 64, 
v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen .LBB0_8: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v33 v_cvt_f16_f32_e32 v1, v34 v_cvt_f16_f32_e32 v2, v35 v_cvt_f16_f32_e32 v3, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v1, v39 v_cvt_f16_f32_e32 v2, v38 v_cvt_f16_f32_e32 v3, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v1, v42 v_cvt_f16_f32_e32 v2, v43 v_cvt_f16_f32_e32 v3, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v1, v47 v_accvgpr_read_b32 v16, a192 
v_cvt_f16_f32_e32 v2, v46 v_accvgpr_read_b32 v17, a193 v_accvgpr_read_b32 v18, a194 v_accvgpr_read_b32 v19, a195 v_accvgpr_read_b32 v20, a196 v_accvgpr_read_b32 v21, a197 v_accvgpr_read_b32 v22, a198 v_accvgpr_read_b32 v23, a199 v_accvgpr_read_b32 v24, a200 v_accvgpr_read_b32 v25, a201 v_accvgpr_read_b32 v26, a202 v_accvgpr_read_b32 v27, a203 v_accvgpr_read_b32 v28, a204 v_accvgpr_read_b32 v29, a205 v_accvgpr_read_b32 v30, a206 v_accvgpr_read_b32 v31, a207 v_cvt_f16_f32_e32 v3, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_10 ; %bb.9: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s41, 1 s_waitcnt lgkmcnt(1) 
buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s41, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen .LBB0_10: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 
offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a176 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a177 v_accvgpr_read_b32 v2, a178 v_accvgpr_read_b32 v3, a179 v_accvgpr_read_b32 v4, a180 v_accvgpr_read_b32 v5, a181 v_accvgpr_read_b32 v6, a182 v_accvgpr_read_b32 v7, a183 v_accvgpr_read_b32 v8, a184 v_accvgpr_read_b32 v9, a185 v_accvgpr_read_b32 v10, a186 v_accvgpr_read_b32 v11, a187 v_accvgpr_read_b32 v12, a188 v_accvgpr_read_b32 v13, a189 v_accvgpr_read_b32 v14, a190 v_accvgpr_read_b32 v15, a191 v_cvt_f16_f32_e32 v19, v28 s_mul_i32 s2, s41, 63 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_12 ; %bb.11: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 
v[20:23], v24 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v25, s[8:11], 0 offen v_add_u32_e32 v25, 8, v49 ds_read_b128 v[16:19], v24 offset:144 v_lshlrev_b32_e32 v26, 1, v25 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[20:23], v26, s[8:11], 0 offen ds_read_b128 v[20:23], v24 offset:128 v_add_lshl_u32 v24, v25, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v24, s[8:11], 0 offen s_nop 0 v_add_u32_e32 v16, s41, v49 v_lshlrev_b32_e32 v17, 1, v16 v_add_u32_e32 v49, s2, v16 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[20:23], v17, s[8:11], 0 offen .LBB0_12: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 
v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a160 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a161 v_accvgpr_read_b32 v18, a162 v_accvgpr_read_b32 v19, a163 v_accvgpr_read_b32 v20, a164 v_accvgpr_read_b32 v21, a165 v_accvgpr_read_b32 v22, a166 v_accvgpr_read_b32 v23, a167 v_accvgpr_read_b32 v24, a168 v_accvgpr_read_b32 v25, a169 v_accvgpr_read_b32 v26, a170 v_accvgpr_read_b32 v27, a171 v_accvgpr_read_b32 v28, a172 v_accvgpr_read_b32 v29, a173 v_accvgpr_read_b32 v30, a174 v_accvgpr_read_b32 v31, a175 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_14 ; %bb.13: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i92.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s41, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen .LBB0_14: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i140.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a144 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a145 v_accvgpr_read_b32 v2, 
a146 v_accvgpr_read_b32 v3, a147 v_accvgpr_read_b32 v4, a148 v_accvgpr_read_b32 v5, a149 v_accvgpr_read_b32 v6, a150 v_accvgpr_read_b32 v7, a151 v_accvgpr_read_b32 v8, a152 v_accvgpr_read_b32 v9, a153 v_accvgpr_read_b32 v10, a154 v_accvgpr_read_b32 v11, a155 v_accvgpr_read_b32 v12, a156 v_accvgpr_read_b32 v13, a157 v_accvgpr_read_b32 v14, a158 v_accvgpr_read_b32 v15, a159 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_16 ; %bb.15: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i192.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v25, s[8:11], 0 offen ds_read_b128 v[16:19], v24 offset:144 ds_read_b128 v[24:27], v24 offset:128 v_add_u32_e32 v28, 8, v49 v_lshlrev_b32_e32 v29, 1, v28 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[20:23], v29, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v20, v28, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v20, s[8:11], 0 offen s_nop 
0 v_add_lshl_u32 v16, v49, s41, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[24:27], v16, s[8:11], 0 offen .LBB0_16: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i240.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 
v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a128 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a129 v_accvgpr_read_b32 v18, a130 v_accvgpr_read_b32 v19, a131 v_accvgpr_read_b32 v20, a132 v_accvgpr_read_b32 v21, a133 v_accvgpr_read_b32 v22, a134 v_accvgpr_read_b32 v23, a135 v_accvgpr_read_b32 v24, a136 v_accvgpr_read_b32 v25, a137 v_accvgpr_read_b32 v26, a138 v_accvgpr_read_b32 v27, a139 v_accvgpr_read_b32 v28, a140 v_accvgpr_read_b32 v29, a141 v_accvgpr_read_b32 v30, a142 v_accvgpr_read_b32 v31, a143 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_18 ; %bb.17: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i292.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen 
s_nop 0 v_add_lshl_u32 v4, v12, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s41, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen .LBB0_18: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i340.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 
offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a112 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a113 v_accvgpr_read_b32 v2, a114 v_accvgpr_read_b32 v3, a115 v_accvgpr_read_b32 v4, a116 v_accvgpr_read_b32 v5, a117 v_accvgpr_read_b32 v6, a118 v_accvgpr_read_b32 v7, a119 v_accvgpr_read_b32 v8, a120 v_accvgpr_read_b32 v9, a121 v_accvgpr_read_b32 v10, a122 v_accvgpr_read_b32 v11, a123 v_accvgpr_read_b32 v12, a124 v_accvgpr_read_b32 v13, a125 v_accvgpr_read_b32 v14, a126 v_accvgpr_read_b32 v15, a127 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_20 ; %bb.19: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i380.i.i.i v_lshlrev_b32_e32 v24, 1, v50 
ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v25, s[8:11], 0 offen v_add_u32_e32 v25, 8, v49 ds_read_b128 v[16:19], v24 offset:144 v_lshlrev_b32_e32 v26, 1, v25 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[20:23], v26, s[8:11], 0 offen ds_read_b128 v[20:23], v24 offset:128 v_add_lshl_u32 v24, v25, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v24, s[8:11], 0 offen s_nop 0 v_add_u32_e32 v16, s41, v49 v_lshlrev_b32_e32 v17, 1, v16 v_add_u32_e32 v49, s2, v16 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[20:23], v17, s[8:11], 0 offen .LBB0_20: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I405.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 
offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a96 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a97 v_accvgpr_read_b32 v18, a98 v_accvgpr_read_b32 v19, a99 v_accvgpr_read_b32 v20, a100 v_accvgpr_read_b32 v21, a101 v_accvgpr_read_b32 v22, a102 v_accvgpr_read_b32 v23, a103 v_accvgpr_read_b32 v24, a104 v_accvgpr_read_b32 v25, a105 v_accvgpr_read_b32 v26, a106 v_accvgpr_read_b32 v27, a107 v_accvgpr_read_b32 v28, a108 v_accvgpr_read_b32 v29, a109 v_accvgpr_read_b32 v30, a110 v_accvgpr_read_b32 v31, a111 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_22 ; %bb.21: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i497.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s41, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen .LBB0_22: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i545.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a80 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a81 v_accvgpr_read_b32 v2, 
a82 v_accvgpr_read_b32 v3, a83 v_accvgpr_read_b32 v4, a84 v_accvgpr_read_b32 v5, a85 v_accvgpr_read_b32 v6, a86 v_accvgpr_read_b32 v7, a87 v_accvgpr_read_b32 v8, a88 v_accvgpr_read_b32 v9, a89 v_accvgpr_read_b32 v10, a90 v_accvgpr_read_b32 v11, a91 v_accvgpr_read_b32 v12, a92 v_accvgpr_read_b32 v13, a93 v_accvgpr_read_b32 v14, a94 v_accvgpr_read_b32 v15, a95 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_24 ; %bb.23: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i597.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v25, s[8:11], 0 offen ds_read_b128 v[16:19], v24 offset:144 ds_read_b128 v[24:27], v24 offset:128 v_add_u32_e32 v28, 8, v49 v_lshlrev_b32_e32 v29, 1, v28 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[20:23], v29, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v20, v28, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v20, s[8:11], 0 offen s_nop 0 
v_add_lshl_u32 v16, v49, s41, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[24:27], v16, s[8:11], 0 offen .LBB0_24: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i645.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 
v1, v14 v_accvgpr_read_b32 v16, a64 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a65 v_accvgpr_read_b32 v18, a66 v_accvgpr_read_b32 v19, a67 v_accvgpr_read_b32 v20, a68 v_accvgpr_read_b32 v21, a69 v_accvgpr_read_b32 v22, a70 v_accvgpr_read_b32 v23, a71 v_accvgpr_read_b32 v24, a72 v_accvgpr_read_b32 v25, a73 v_accvgpr_read_b32 v26, a74 v_accvgpr_read_b32 v27, a75 v_accvgpr_read_b32 v28, a76 v_accvgpr_read_b32 v29, a77 v_accvgpr_read_b32 v30, a78 v_accvgpr_read_b32 v31, a79 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_26 ; %bb.25: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i697.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 s_waitcnt lgkmcnt(2) buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, 
s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s41, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen .LBB0_26: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i745.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 
offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a48 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a49 v_accvgpr_read_b32 v2, a50 v_accvgpr_read_b32 v3, a51 v_accvgpr_read_b32 v4, a52 v_accvgpr_read_b32 v5, a53 v_accvgpr_read_b32 v6, a54 v_accvgpr_read_b32 v7, a55 v_accvgpr_read_b32 v8, a56 v_accvgpr_read_b32 v9, a57 v_accvgpr_read_b32 v10, a58 v_accvgpr_read_b32 v11, a59 v_accvgpr_read_b32 v12, a60 v_accvgpr_read_b32 v13, a61 v_accvgpr_read_b32 v14, a62 v_accvgpr_read_b32 v15, a63 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB0_28 ; %bb.27: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i785.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], 
v24 offset:16 s_lshl_b32 s10, s33, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v25, s[8:11], 0 offen v_add_u32_e32 v25, 8, v49 ds_read_b128 v[16:19], v24 offset:144 v_lshlrev_b32_e32 v26, 1, v25 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[20:23], v26, s[8:11], 0 offen ds_read_b128 v[20:23], v24 offset:128 v_add_lshl_u32 v24, v25, s41, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v24, s[8:11], 0 offen s_nop 0 v_add_u32_e32 v16, s41, v49 v_lshlrev_b32_e32 v17, 1, v16 v_add_u32_e32 v49, s2, v16 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[20:23], v17, s[8:11], 0 offen .LBB0_28: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I810.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 
; ---------------------------------------------------------------------------
; Result write-back: one instance of a stage pattern that repeats with
; different registers. One statement per line restored; instruction order and
; operands are exactly as emitted by the compiler.
;
; Pattern of a stage, as visible in this span:
;   * v_cvt_f16_f32   - convert f32 values to f16, four at a time
;   * ds_write_b16    - store each f16 to LDS at v51 + constant offset
;   * v_accvgpr_read  - copy 16 accumulator registers into VGPRs for the
;                       following stage (interleaved with the converts)
;   * inline-asm "s_waitcnt lgkmcnt(0)" + "s_barrier"
;   * with exec limited to the vcc lanes: four ds_read_b128 from v50*2 and
;     four buffer_store_dwordx4 through the resource in s[8:11]
;
; This span starts partway through a stage: v0..v3 were already handled on the
; preceding line; v4..v15 were read from a52..a63 earlier in the function.
; ---------------------------------------------------------------------------
	; v4..v7 -> f16, written in reverse register order to offsets 1408..1024.
	v_cvt_f16_f32_e32 v0, v7
	v_cvt_f16_f32_e32 v1, v6
	v_cvt_f16_f32_e32 v2, v5
	v_cvt_f16_f32_e32 v3, v4
	ds_write_b16 v51, v0 offset:1408
	ds_write_b16 v51, v1 offset:1280
	ds_write_b16 v51, v2 offset:1152
	ds_write_b16 v51, v3 offset:1024
	; v8..v11 -> f16, written in ascending order to offsets 2048..2432.
	v_cvt_f16_f32_e32 v0, v8
	v_cvt_f16_f32_e32 v1, v9
	v_cvt_f16_f32_e32 v2, v10
	v_cvt_f16_f32_e32 v3, v11
	ds_write_b16 v51, v0 offset:2048
	ds_write_b16 v51, v1 offset:2176
	ds_write_b16 v51, v2 offset:2304
	ds_write_b16 v51, v3 offset:2432
	; v12..v15 -> f16 (reverse order), interleaved with copying a32..a47 into
	; v16..v31, which the stage at .LBB0_30 converts.
	v_cvt_f16_f32_e32 v0, v15
	v_cvt_f16_f32_e32 v1, v14
	v_accvgpr_read_b32 v16, a32
	v_cvt_f16_f32_e32 v2, v13
	v_accvgpr_read_b32 v17, a33
	v_accvgpr_read_b32 v18, a34
	v_accvgpr_read_b32 v19, a35
	v_accvgpr_read_b32 v20, a36
	v_accvgpr_read_b32 v21, a37
	v_accvgpr_read_b32 v22, a38
	v_accvgpr_read_b32 v23, a39
	v_accvgpr_read_b32 v24, a40
	v_accvgpr_read_b32 v25, a41
	v_accvgpr_read_b32 v26, a42
	v_accvgpr_read_b32 v27, a43
	v_accvgpr_read_b32 v28, a44
	v_accvgpr_read_b32 v29, a45
	v_accvgpr_read_b32 v30, a46
	v_accvgpr_read_b32 v31, a47
	v_cvt_f16_f32_e32 v3, v12
	ds_write_b16 v51, v0 offset:3456
	ds_write_b16 v51, v1 offset:3328
	ds_write_b16 v51, v2 offset:3200
	ds_write_b16 v51, v3 offset:3072
	; Inline asm from the source: wait for outstanding LDS operations, then
	; barrier, before the LDS contents written above are read back.
	;;#ASMSTART
	s_waitcnt lgkmcnt(0)
	s_barrier
	;;#ASMEND
	; exec &= vcc, previous exec kept in s[0:1]; skip the stores when no lane
	; remains active.
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_30
; %bb.29: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i902.i.i.i
	; Guarded store block. LDS byte address = v50 << 1; destination byte
	; offsets are built from v49 << 1.
	; LDS offset -> destination offset pairing in this block:
	;     0 -> v49          16 -> v49 + 8
	;   128 -> v49 + s41   144 -> v49 + 8 + s41     (all shifted left by 1)
	; NOTE(review): this looks like v49 being an element index and s41 a row
	; stride in elements - confirm against the caller-side descriptor.
	v_lshlrev_b32_e32 v8, 1, v50
	ds_read_b128 v[0:3], v8
	ds_read_b128 v[4:7], v8 offset:16
	; Rewrite dwords 2 and 3 of the buffer resource s[8:11]: s10 = s33 << 1,
	; s11 = 0x20000. NOTE(review): presumably the size and format/flags words
	; of the resource - confirm against the ISA buffer-resource layout.
	s_lshl_b32 s10, s33, 1
	s_mov_b32 s11, 0x20000
	v_lshlrev_b32_e32 v9, 1, v49
	; One LDS read may remain in flight: the first read (v[0:3]) has landed.
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen
	ds_read_b128 v[0:3], v8 offset:144
	ds_read_b128 v[8:11], v8 offset:128
	v_add_u32_e32 v12, 8, v49
	v_lshlrev_b32_e32 v13, 1, v12
	; Three reads were outstanding; allowing two means v[4:7] has landed.
	s_waitcnt lgkmcnt(2)
	buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen
	; The next VALU instruction overwrites v4, a data register of the store
	; just issued; presumably this nop is the required hazard padding.
	s_nop 0
	v_add_lshl_u32 v4, v12, s41, 1
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen
	; Same situation for v0.
	s_nop 0
	v_add_lshl_u32 v0, v49, s41, 1
	; v49 = v49 - 64 (subrev subtracts the first source from the second);
	; the updated value is used by the next store block.
	v_subrev_u32_e32 v49, 64, v49
	s_waitcnt lgkmcnt(0)
	buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen
.LBB0_30: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i950.i.i.i
	; Join point: re-enable the lanes that were masked off for the stores.
	s_or_b64 exec, exec, s[0:1]
	; Next stage: v16..v31 (copied from a32..a47) -> f16, converted in place.
	v_cvt_f16_f32_e32 v16, v16
	v_cvt_f16_f32_e32 v17, v17
	v_cvt_f16_f32_e32 v18, v18
	v_cvt_f16_f32_e32 v19, v19
	; Barrier before the LDS region read by the previous store block is
	; overwritten by the ds_write_b16 instructions that follow.
	;;#ASMSTART
	s_waitcnt lgkmcnt(0)
	s_barrier
	;;#ASMEND
	ds_write_b16 v51, v16
	ds_write_b16 v51, v17 offset:128
	ds_write_b16 v51, v18 offset:256
	ds_write_b16 v51, v19 offset:384
	; v20..v23, reverse order, offsets 1408..1024.
	v_cvt_f16_f32_e32 v16, v23
	v_cvt_f16_f32_e32 v17, v22
	v_cvt_f16_f32_e32 v18, v21
	v_cvt_f16_f32_e32 v19, v20
	ds_write_b16 v51, v16 offset:1408
	ds_write_b16 v51, v17 offset:1280
	ds_write_b16 v51, v18 offset:1152
	ds_write_b16 v51, v19 offset:1024
	; v24..v27, ascending order, offsets 2048..2432.
	v_cvt_f16_f32_e32 v16, v24
	v_cvt_f16_f32_e32 v17, v25
	v_cvt_f16_f32_e32 v18, v26
	v_cvt_f16_f32_e32 v19, v27
	ds_write_b16 v51, v16 offset:2048
	ds_write_b16 v51, v17 offset:2176
	ds_write_b16 v51, v18 offset:2304
	ds_write_b16 v51, v19 offset:2432
	; v28..v31 (reverse order), interleaved with copying a16..a31 into
	; v0..v15 for the stage after this one.
	v_cvt_f16_f32_e32 v16, v31
	v_cvt_f16_f32_e32 v17, v30
	v_accvgpr_read_b32 v0, a16
	v_cvt_f16_f32_e32 v18, v29
	v_accvgpr_read_b32 v1, a17
	v_accvgpr_read_b32 v2, a18
	v_accvgpr_read_b32 v3, a19
	v_accvgpr_read_b32 v4, a20
	v_accvgpr_read_b32 v5, a21
	v_accvgpr_read_b32 v6, a22
	v_accvgpr_read_b32 v7, a23
	v_accvgpr_read_b32 v8, a24
	v_accvgpr_read_b32 v9, a25
	v_accvgpr_read_b32 v10, a26
	v_accvgpr_read_b32 v11, a27
	v_accvgpr_read_b32 v12, a28
	v_accvgpr_read_b32 v13, a29
	v_accvgpr_read_b32 v14, a30
	v_accvgpr_read_b32 v15, a31
	v_cvt_f16_f32_e32 v19, v28
	ds_write_b16 v51, v16 offset:3456
	ds_write_b16 v51, v17 offset:3328
	ds_write_b16 v51, v18 offset:3200
	ds_write_b16 v51, v19 offset:3072
	;;#ASMSTART
	s_waitcnt lgkmcnt(0)
	s_barrier
	;;#ASMEND
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_32
; %bb.31: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i1002.i.i.i
	; Same guarded store sequence as %bb.29, using v16..v29 as scratch.
	; Its final address computation and store follow this span.
	v_lshlrev_b32_e32 v24, 1, v50
	ds_read_b128 v[16:19], v24
	ds_read_b128 v[20:23], v24 offset:16
	s_lshl_b32 s10, s33, 1
	s_mov_b32 s11, 0x20000
	v_lshlrev_b32_e32 v25, 1, v49
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[16:19], v25, s[8:11], 0 offen
	ds_read_b128 v[16:19], v24 offset:144
	ds_read_b128 v[24:27], v24 offset:128
	v_add_u32_e32 v28, 8, v49
	v_lshlrev_b32_e32 v29, 1, v28
	s_waitcnt lgkmcnt(2)
	buffer_store_dwordx4 v[20:23], v29, s[8:11], 0 offen
	s_nop 0
	v_add_lshl_u32 v20, v28, s41, 1
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[16:19], v20, s[8:11], 0 offen
	s_nop 0
; ---------------------------------------------------------------------------
; End of the guarded store block %bb.31 followed by the first part of the
; stage at .LBB0_32. One statement per line restored; order and operands are
; exactly as emitted by the compiler.
; ---------------------------------------------------------------------------
	; Fourth store of the block: byte offset (v49 + s41) << 1, data in
	; v[24:27] (read from LDS offset 128 just before this span).
	v_add_lshl_u32 v16, v49, s41, 1
	; v49 = v49 - 64, consumed by the following store block.
	v_subrev_u32_e32 v49, 64, v49
	; All outstanding LDS reads must have landed before v[24:27] is stored.
	s_waitcnt lgkmcnt(0)
	buffer_store_dwordx4 v[24:27], v16, s[8:11], 0 offen
.LBB0_32: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i1050.i.i.i
	; Join point: exec |= s[0:1], re-enabling lanes masked off for the stores.
	s_or_b64 exec, exec, s[0:1]
	; Stage input: v0..v15, copied from a16..a31 earlier in the function.
	; v0..v3 -> f16 in place.
	v_cvt_f16_f32_e32 v0, v0
	v_cvt_f16_f32_e32 v1, v1
	v_cvt_f16_f32_e32 v2, v2
	v_cvt_f16_f32_e32 v3, v3
	; Inline-asm wait + barrier ahead of the LDS writes below, which reuse the
	; offsets read by the preceding store block.
	;;#ASMSTART
	s_waitcnt lgkmcnt(0)
	s_barrier
	;;#ASMEND
	; 16-bit LDS writes at v51 + {0, 128, 256, 384}.
	ds_write_b16 v51, v0
	ds_write_b16 v51, v1 offset:128
	ds_write_b16 v51, v2 offset:256
	ds_write_b16 v51, v3 offset:384
	; v4..v7, reverse register order, offsets 1408..1024.
	v_cvt_f16_f32_e32 v0, v7
	v_cvt_f16_f32_e32 v1, v6
	v_cvt_f16_f32_e32 v2, v5
	v_cvt_f16_f32_e32 v3, v4
	ds_write_b16 v51, v0 offset:1408
	ds_write_b16 v51, v1 offset:1280
	ds_write_b16 v51, v2 offset:1152
	ds_write_b16 v51, v3 offset:1024
	; v8..v11, ascending order, offsets 2048..2432.
	v_cvt_f16_f32_e32 v0, v8
	v_cvt_f16_f32_e32 v1, v9
	v_cvt_f16_f32_e32 v2, v10
	v_cvt_f16_f32_e32 v3, v11
	ds_write_b16 v51, v0 offset:2048
	ds_write_b16 v51, v1 offset:2176
	ds_write_b16 v51, v2 offset:2304
	ds_write_b16 v51, v3 offset:2432
	; First convert of the last group (v12..v15); the rest follows this span.
	v_cvt_f16_f32_e32 v0, v15
; ---------------------------------------------------------------------------
; Remainder of the stage that converts v12..v15, the guarded store block
; %bb.33, and the first part of the stage at .LBB0_34. One statement per line
; restored; order and operands are exactly as emitted by the compiler.
; ---------------------------------------------------------------------------
	; Converts of v14, v13, v12 interleaved with copying a15..a0 into
	; v31..v16; those VGPRs are converted after .LBB0_34.
	v_cvt_f16_f32_e32 v1, v14
	v_accvgpr_read_b32 v31, a15
	v_cvt_f16_f32_e32 v2, v13
	v_accvgpr_read_b32 v30, a14
	v_accvgpr_read_b32 v29, a13
	v_accvgpr_read_b32 v28, a12
	v_accvgpr_read_b32 v27, a11
	v_accvgpr_read_b32 v26, a10
	v_accvgpr_read_b32 v25, a9
	v_accvgpr_read_b32 v24, a8
	v_accvgpr_read_b32 v23, a7
	v_accvgpr_read_b32 v22, a6
	v_accvgpr_read_b32 v21, a5
	v_accvgpr_read_b32 v20, a4
	v_accvgpr_read_b32 v19, a3
	v_accvgpr_read_b32 v18, a2
	v_accvgpr_read_b32 v17, a1
	v_accvgpr_read_b32 v16, a0
	v_cvt_f16_f32_e32 v3, v12
	; 16-bit LDS writes at v51 + {3456, 3328, 3200, 3072}; v0 was produced by
	; the convert immediately preceding this span.
	ds_write_b16 v51, v0 offset:3456
	ds_write_b16 v51, v1 offset:3328
	ds_write_b16 v51, v2 offset:3200
	ds_write_b16 v51, v3 offset:3072
	; Inline-asm wait + barrier before the LDS data is read back.
	;;#ASMSTART
	s_waitcnt lgkmcnt(0)
	s_barrier
	;;#ASMEND
	; exec &= vcc with the old exec saved in s[0:1]; branch past the stores
	; when no lane is left active.
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_34
; %bb.33: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i1102.i.i.i
	; Four 128-bit LDS reads at (v50 << 1) + {0, 16, 144, 128}, each stored
	; through s[8:11] at byte offsets v49<<1, (v49+8)<<1, (v49+8+s41)<<1 and
	; (v49+s41)<<1 respectively.
	v_lshlrev_b32_e32 v8, 1, v50
	ds_read_b128 v[0:3], v8
	ds_read_b128 v[4:7], v8 offset:16
	; Dwords 2 and 3 of the buffer resource are set here: s10 = s33 << 1,
	; s11 = 0x20000. NOTE(review): presumably size and format/flags words -
	; confirm against the ISA buffer-resource layout.
	s_lshl_b32 s10, s33, 1
	s_mov_b32 s11, 0x20000
	v_lshlrev_b32_e32 v9, 1, v49
	; The first read (v[0:3]) has landed once at most one is outstanding.
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen
	ds_read_b128 v[0:3], v8 offset:144
	ds_read_b128 v[8:11], v8 offset:128
	v_add_u32_e32 v12, 8, v49
	v_lshlrev_b32_e32 v13, 1, v12
	; Three reads in flight; allowing two means v[4:7] is ready.
	s_waitcnt lgkmcnt(2)
	buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen
	; v4 (store data above) is overwritten next; presumably hazard padding.
	s_nop 0
	v_add_lshl_u32 v4, v12, s41, 1
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen
	; Same for v0.
	s_nop 0
	v_add_lshl_u32 v0, v49, s41, 1
	; v49 = v49 - 64 for the following store block.
	v_subrev_u32_e32 v49, 64, v49
	s_waitcnt lgkmcnt(0)
	buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen
.LBB0_34: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i1150.i.i.i
	; Join point: restore the lanes masked off for the stores.
	s_or_b64 exec, exec, s[0:1]
	; Stage input v16..v31 (from a0..a15); results go to v0..v3 rather than
	; being converted in place.
	v_cvt_f16_f32_e32 v0, v16
	v_cvt_f16_f32_e32 v1, v17
	v_cvt_f16_f32_e32 v2, v18
	v_cvt_f16_f32_e32 v3, v19
	; Inline-asm wait + barrier ahead of overwriting the LDS region that the
	; store block above read from.
	;;#ASMSTART
	s_waitcnt lgkmcnt(0)
	s_barrier
	;;#ASMEND
	ds_write_b16 v51, v0
	ds_write_b16 v51, v1 offset:128
	ds_write_b16 v51, v2 offset:256
	ds_write_b16 v51, v3 offset:384
	; v20..v23, reverse register order, offsets 1408..1024.
	v_cvt_f16_f32_e32 v0, v23
	v_cvt_f16_f32_e32 v1, v22
	v_cvt_f16_f32_e32 v2, v21
	v_cvt_f16_f32_e32 v3, v20
	ds_write_b16 v51, v0 offset:1408
	ds_write_b16 v51, v1 offset:1280
	ds_write_b16 v51, v2 offset:1152
	ds_write_b16 v51, v3 offset:1024
	; v24..v27, ascending order; the writes for v2/v3 follow this span.
	v_cvt_f16_f32_e32 v0, v24
	v_cvt_f16_f32_e32 v1, v25
	v_cvt_f16_f32_e32 v2, v26
	v_cvt_f16_f32_e32 v3, v27
	ds_write_b16 v51, v0 offset:2048
	ds_write_b16 v51, v1 offset:2176
; ---------------------------------------------------------------------------
; Last part of the final write-back stage, kernel exit, and the AMDHSA kernel
; descriptor. One statement per line restored; order, operands and directive
; values are exactly as emitted by the compiler.
; ---------------------------------------------------------------------------
	; Remaining LDS writes of the group converted just before this span.
	ds_write_b16 v51, v2 offset:2304
	ds_write_b16 v51, v3 offset:2432
	; v28..v31 -> f16, reverse register order, offsets 3456..3072.
	v_cvt_f16_f32_e32 v0, v31
	v_cvt_f16_f32_e32 v1, v30
	v_cvt_f16_f32_e32 v2, v29
	v_cvt_f16_f32_e32 v3, v28
	ds_write_b16 v51, v0 offset:3456
	ds_write_b16 v51, v1 offset:3328
	ds_write_b16 v51, v2 offset:3200
	ds_write_b16 v51, v3 offset:3072
	; Inline-asm wait + barrier before the LDS data is read back.
	;;#ASMSTART
	s_waitcnt lgkmcnt(0)
	s_barrier
	;;#ASMEND
	; exec &= vcc (old exec in s[0:1]); go straight to the exit when no lane
	; remains active.
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz .LBB0_36
; %bb.35:
	; Final guarded store block: four 128-bit LDS reads at
	; (v50 << 1) + {0, 16, 144, 128}, stored through s[8:11] at byte offsets
	; v49<<1, (v49+8)<<1, (v49+8+s41)<<1 and (v49+s41)<<1. Unlike the earlier
	; store blocks, v49 is not updated afterwards.
	v_lshlrev_b32_e32 v8, 1, v50
	ds_read_b128 v[0:3], v8
	ds_read_b128 v[4:7], v8 offset:16
	; s10 = s33 << 1 and s11 = 0x20000 fill dwords 2 and 3 of the resource.
	; NOTE(review): presumably size and format/flags words - confirm against
	; the ISA buffer-resource layout.
	s_lshl_b32 s10, s33, 1
	s_mov_b32 s11, 0x20000
	v_lshlrev_b32_e32 v9, 1, v49
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[0:3], v9, s[8:11], 0 offen
	ds_read_b128 v[0:3], v8 offset:144
	ds_read_b128 v[8:11], v8 offset:128
	v_add_u32_e32 v12, 8, v49
	v_lshlrev_b32_e32 v13, 1, v12
	; Three reads in flight; allowing two means v[4:7] is ready.
	s_waitcnt lgkmcnt(2)
	buffer_store_dwordx4 v[4:7], v13, s[8:11], 0 offen
	; v4 (store data above) is overwritten next; presumably hazard padding.
	s_nop 0
	v_add_lshl_u32 v4, v12, s41, 1
	s_waitcnt lgkmcnt(1)
	buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen
	; Same for v0.
	s_nop 0
	v_add_lshl_u32 v0, v49, s41, 1
	s_waitcnt lgkmcnt(0)
	buffer_store_dwordx4 v[8:11], v0, s[8:11], 0 offen
.LBB0_36: ; %_ZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_IJ
	; Kernel exit. exec is not restored from s[0:1] on this path; the wave
	; terminates here.
	s_endpgm
	; Kernel descriptor, placed in .rodata and aligned to 2^6 = 64 bytes.
	.section .rodata,#alloc
	.p2align 6
	.amdhsa_kernel _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_
		; Memory footprint: 34816 bytes of group segment (the kernel-info
		; comment after the function reports the same number as
		; LDSByteSize), no private segment, 544 bytes of kernel arguments.
		.amdhsa_group_segment_fixed_size 34816
		.amdhsa_private_segment_fixed_size 0
		.amdhsa_kernarg_size 544
		; User SGPRs requested at launch: only the private segment buffer
		; and the kernarg segment pointer are enabled.
		.amdhsa_user_sgpr_private_segment_buffer 1
		.amdhsa_user_sgpr_dispatch_ptr 0
		.amdhsa_user_sgpr_queue_ptr 0
		.amdhsa_user_sgpr_kernarg_segment_ptr 1
		.amdhsa_user_sgpr_dispatch_id 0
		.amdhsa_user_sgpr_flat_scratch_init 0
		.amdhsa_user_sgpr_private_segment_size 0
		; System registers: only workgroup_id_x is enabled among the SGPRs;
		; workitem_id is 0 (TIDIG_COMP_CNT: 0 in the kernel-info comment).
		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
		.amdhsa_system_sgpr_workgroup_id_x 1
		.amdhsa_system_sgpr_workgroup_id_y 0
		.amdhsa_system_sgpr_workgroup_id_z 0
		.amdhsa_system_sgpr_workgroup_info 0
		.amdhsa_system_vgpr_workitem_id 0
		; Register budget: 256 matches TotalNumVgprs in the kernel-info
		; comment; the SGPR figure there (NumSgprs: 50) is 2 higher than 48.
		.amdhsa_next_free_vgpr 256
		.amdhsa_next_free_sgpr 48
		.amdhsa_reserve_flat_scratch 0
		.amdhsa_reserve_xnack_mask 1
		; Floating-point mode: round modes 0, denorm modes 3 (the kernel-info
		; comment reports FloatMode: 240 and IeeeMode: 1).
		.amdhsa_float_round_mode_32 0
		.amdhsa_float_round_mode_16_64 0
		.amdhsa_float_denorm_mode_32 3
		.amdhsa_float_denorm_mode_16_64 3
		.amdhsa_dx10_clamp 1
		.amdhsa_ieee_mode 1
		.amdhsa_fp16_overflow 0
		; All exception enables are 0.
		.amdhsa_exception_fp_ieee_invalid_op 0
		.amdhsa_exception_fp_denorm_src 0
		.amdhsa_exception_fp_ieee_div_zero 0
		.amdhsa_exception_fp_ieee_overflow 0
		.amdhsa_exception_fp_ieee_underflow 0
		.amdhsa_exception_fp_ieee_inexact 0
		.amdhsa_exception_int_div_zero 0
	.end_amdhsa_kernel
	; Switch back to the kernel's own text section for the end-of-function
	; label and .size directive that follow.
	.section .text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr
.Lfunc_end0: .size _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_, 
.Lfunc_end0-_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- End 
function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 15304 ; NumSgprs: 50 ; NumVgprs: 93 ; NumAgprs: 256 ; TotalNumVgprs: 256 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 34816 bytes/workgroup (compile time only) ; SGPRBlocks: 6 ; VGPRBlocks: 63 ; NumSGPRsForWavesPerEU: 50 ; NumVGPRsForWavesPerEU: 256 ; Occupancy: 1 ; WaveLimiterHint : 0 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 .section .text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11Pa
ssThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr .protected _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wis
e11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- Begin function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11P
assThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .globl _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S2
9_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .p2align 8 .type _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256
ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,@function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi3
2ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_: ; @_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4EN
SK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; %bb.0: s_load_dwordx2 s[12:13], s[4:5], 0x0 s_load_dwordx2 s[16:17], s[4:5], 0x8 s_load_dwordx2 s[8:9], s[4:5], 0x10 s_load_dwordx2 s[2:3], s[4:5], 0x24 s_load_dword s44, s[4:5], 0x48 s_load_dword s10, s[4:5], 0x50 s_load_dword s11, s[4:5], 0x58 s_load_dwordx2 s[40:41], s[4:5], 0x6c s_load_dword s7, s[4:5], 0x84 s_load_dwordx4 s[20:23], s[4:5], 0x1e0 s_load_dwordx4 s[24:27], s[4:5], 0x1f4 s_load_dwordx4 s[36:39], s[4:5], 0x208 s_load_dwordx2 s[28:29], s[4:5], 0x120 s_load_dwordx2 s[30:31], s[4:5], 0x12c s_load_dwordx2 s[0:1], s[4:5], 0x13c s_load_dwordx2 s[18:19], s[4:5], 0x148 s_load_dword s33, s[4:5], 0x1d4 s_waitcnt lgkmcnt(0) s_mul_hi_u32 s14, s27, s6 s_add_i32 s14, s6, s14 s_lshr_b32 s14, s14, s39 s_mul_i32 s15, s14, s23 s_sub_i32 s6, s6, s15 s_mul_hi_u32 s15, s14, s26 s_add_i32 s15, s14, s15 s_lshr_b32 s23, s15, s38 s_mul_i32 s15, s23, s22 s_sub_i32 s14, s14, s15 s_mul_hi_u32 s15, s23, s25 s_add_i32 s15, s23, s15 s_lshr_b32 s15, s15, s37 s_mul_hi_u32 s22, s15, s24 s_add_i32 s22, s15, s22 s_lshr_b32 s22, s22, s36 v_lshrrev_b32_e32 v1, 5, v0 v_lshrrev_b32_e32 v29, 7, v0 v_mad_i32_i24 v30, v29, -4, v1 s_mul_i32 s24, s22, s41 v_add_u32_e32 v58, s24, v30 v_mul_hi_u32 v2, v58, s10 s_load_dword s25, s[4:5], 0x1c4 s_load_dword s41, s[4:5], 0x1b0 s_mul_i32 s20, s22, s20 v_add_u32_e32 v2, v58, v2 v_lshrrev_b32_e32 v31, s11, v2 v_mul_lo_u32 v2, v31, s44 v_lshlrev_b32_e32 v37, 2, v29 s_mul_i32 s21, s15, s21 s_sub_i32 
s15, s15, s20 v_sub_u32_e32 v38, v58, v2 v_lshl_or_b32 v2, v31, 3, v37 s_waitcnt lgkmcnt(0) s_mul_i32 s42, s15, s25 s_movk_i32 s15, 0xffe0 v_mul_lo_u32 v2, v2, s2 v_mul_lo_u32 v3, v38, s3 s_add_i32 s42, s42, s14 v_mad_i32_i24 v39, v1, s15, v0 s_lshl_b32 s14, s42, 8 v_lshlrev_b32_e32 v17, 3, v39 v_add_u32_e32 v1, s14, v17 v_add3_u32 v1, v1, v2, v3 s_lshl_b32 s14, s7, 1 s_mov_b32 s15, 0x20000 v_lshlrev_b32_e32 v9, 1, v1 v_add_u32_e32 v10, s2, v1 v_lshlrev_b32_e32 v11, 1, v10 buffer_load_dwordx4 v[1:4], v9, s[12:15], 0 offen buffer_load_dwordx4 v[5:8], v11, s[12:15], 0 offen v_add_u32_e32 v9, s2, v10 v_lshlrev_b32_e32 v18, 1, v9 v_add_u32_e32 v40, s2, v9 v_lshlrev_b32_e32 v19, 1, v40 buffer_load_dwordx4 v[9:12], v18, s[12:15], 0 offen buffer_load_dwordx4 v[13:16], v19, s[12:15], 0 offen s_sub_i32 s7, s23, s21 s_mul_i32 s7, s7, s33 s_add_i32 s6, s6, s7 s_lshl_b32 s43, s6, 8 s_load_dwordx2 s[20:21], s[4:5], 0x154 s_load_dword s6, s[4:5], 0x16c v_add_u32_e32 v17, s43, v17 v_mul_hi_u32 v18, v17, s19 s_load_dword s19, s[4:5], 0x180 s_load_dword s33, s[4:5], 0x18c v_accvgpr_write_b32 a240, 0 s_waitcnt lgkmcnt(0) s_mul_i32 s22, s22, s6 v_add_u32_e32 v57, s22, v30 v_mul_hi_u32 v20, v57, s29 v_add_u32_e32 v18, v17, v18 v_lshrrev_b32_e32 v18, s21, v18 v_mul_hi_u32 v19, v18, s18 v_add_u32_e32 v20, v57, v20 v_lshrrev_b32_e32 v20, s31, v20 s_load_dwordx2 s[6:7], s[4:5], 0xbc s_load_dwordx2 s[34:35], s[4:5], 0xd4 s_load_dwordx2 s[36:37], s[4:5], 0xe4 s_load_dwordx2 s[38:39], s[4:5], 0x114 v_mul_hi_u32 v21, v20, s28 v_add_u32_e32 v19, v18, v19 v_lshrrev_b32_e32 v19, s20, v19 v_mul_lo_u32 v22, v19, s0 v_add_u32_e32 v21, v20, v21 s_waitcnt lgkmcnt(0) v_mul_lo_u32 v23, v20, s39 v_lshrrev_b32_e32 v41, s30, v21 v_mul_lo_u32 v21, v41, s38 v_sub_u32_e32 v22, v18, v22 v_sub_u32_e32 v44, v57, v23 s_load_dwordx4 s[20:23], s[4:5], 0x98 s_load_dwordx4 s[24:27], s[4:5], 0xac v_sub_u32_e32 v46, v20, v21 v_mul_lo_u32 v20, v22, s36 v_mul_lo_u32 v21, v44, s37 v_mul_lo_u32 v19, v19, s34 v_mul_lo_u32 
v22, v46, s35 v_mul_lo_u32 v18, v18, s1 v_add_u32_e32 v52, v21, v20 v_lshl_or_b32 v23, v41, 3, v37 v_add_u32_e32 v53, v22, v19 v_subrev_u32_e32 v19, s6, v52 s_waitcnt lgkmcnt(0) v_subrev_u32_e32 v20, s25, v53 v_mul_lo_u32 v19, v19, s22 s_sub_i32 s5, s27, s7 v_mul_lo_u32 v21, v23, s20 v_mul_lo_u32 v20, v20, s21 v_cmp_le_i32_e32 vcc, s6, v52 v_cmp_gt_i32_e64 s[0:1], s5, v52 s_sub_i32 s7, s24, s26 s_and_b64 s[46:47], vcc, s[0:1] v_cmp_le_i32_e32 vcc, s25, v53 v_cmp_gt_i32_e64 s[0:1], s7, v53 v_sub_u32_e32 v17, v17, v18 s_and_b64 s[0:1], vcc, s[0:1] s_brev_b32 s23, -2 v_add_u32_e32 v17, v17, v19 v_mov_b32_e32 v54, s23 s_and_b64 s[0:1], s[46:47], s[0:1] v_add3_u32 v17, v17, v21, v20 v_cndmask_b32_e64 v25, v54, 0, s[0:1] s_lshl_b32 s18, s19, 1 s_mov_b32 s19, s15 v_lshl_add_u32 v26, v17, 1, v25 v_add_u32_e32 v27, s20, v17 v_lshl_add_u32 v28, v27, 1, v25 buffer_load_dwordx4 v[17:20], v26, s[16:19], 0 offen buffer_load_dwordx4 v[21:24], v28, s[16:19], 0 offen v_add_u32_e32 v26, s20, v27 v_lshl_add_u32 v42, v26, 1, v25 v_add_u32_e32 v55, s20, v26 v_lshl_add_u32 v43, v55, 1, v25 buffer_load_dwordx4 v[25:28], v42, s[16:19], 0 offen buffer_load_dwordx4 v[33:36], v43, s[16:19], 0 offen s_movk_i32 s0, 0x880 s_waitcnt vmcnt(6) ;;#ASMSTART v_pack_b32_f16 v48, v1, v5 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v50, v1, v5, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(4) ;;#ASMSTART v_pack_b32_f16 v49, v9, v13 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v51, v9, v13, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v1, v2, v6 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v2, v6, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v2, v10, v14 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v10, v14, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v3, v7 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v3, v7, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v11, v15 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v11, v15, op_sel:[1, 1] ;;#ASMEND v_mul_lo_u32 v11, v30, s0 ;;#ASMSTART v_pack_b32_f16 v3, v4, v8 
;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v4, v8, op_sel:[1, 1] ;;#ASMEND s_movk_i32 s24, 0x44 v_and_b32_e32 v4, 63, v0 v_and_b32_e32 v8, 32, v0 v_mul_lo_u32 v15, v39, s24 v_sub_u32_e32 v39, v4, v8 v_lshrrev_b32_e32 v4, 4, v0 v_or_b32_e32 v11, v11, v37 v_and_b32_e32 v37, 2, v4 ;;#ASMSTART v_pack_b32_f16 v4, v12, v16 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v12, v16, op_sel:[1, 1] ;;#ASMEND v_add_u32_e32 v12, 4, v58 v_mul_hi_u32 v16, v12, s10 v_lshlrev_b32_e32 v30, 5, v29 v_add_u32_e32 v56, v39, v30 v_ashrrev_i16_e32 v42, 15, v56 v_lshrrev_b16_e32 v42, 13, v42 v_add_u32_e32 v16, v12, v16 v_add_u16_e32 v59, v56, v42 v_lshrrev_b32_e32 v42, s11, v16 v_mul_lo_u32 v16, v42, s44 v_mul_u32_u24_e32 v60, s0, v37 v_lshrrev_b32_e32 v37, 6, v0 v_mad_i32_i24 v45, v29, -2, v37 v_add_u32_e32 v37, 4, v57 v_sub_u32_e32 v43, v12, v16 v_sub_u32_e32 v16, v42, v31 v_mul_hi_u32 v31, v37, s29 v_sub_u32_e32 v12, v43, v38 v_lshl_add_u32 v16, v16, 3, -3 v_mul_lo_u32 v16, v16, s2 v_add_u32_e32 v31, v37, v31 v_lshrrev_b32_e32 v38, s31, v31 v_mul_lo_u32 v12, v12, s3 v_mul_hi_u32 v47, v38, s28 v_lshl_add_u32 v31, v45, 5, v39 v_ashrrev_i32_e32 v39, 31, v31 v_add3_u32 v61, v12, v16, v40 v_add_u32_e32 v16, v38, v47 v_mul_lo_u32 v12, v38, s39 v_lshrrev_b32_e32 v45, s30, v16 v_mul_lo_u32 v16, v45, s38 v_lshrrev_b32_e32 v39, 29, v39 v_sub_u32_e32 v47, v37, v12 v_sub_u32_e32 v12, v47, v44 v_sub_u32_e32 v44, v38, v16 v_sub_u32_e32 v16, v44, v46 v_mul_lo_u32 v12, v12, s37 v_mul_lo_u32 v16, v16, s35 v_sub_u32_e32 v38, v45, v41 v_lshl_add_u32 v38, v38, 3, -3 v_mul_lo_u32 v38, v38, s20 v_mul_lo_u32 v40, v12, s22 v_mul_lo_u32 v41, v16, s21 v_add_u32_e32 v39, v31, v39 v_ashrrev_i32_e32 v46, 3, v39 v_add_u32_e32 v38, v38, v40 v_add3_u32 v41, v38, v41, v55 v_mul_lo_u32 v38, v46, s24 v_and_b32_e32 v39, -8, v39 v_sub_u32_e32 v62, v31, v39 v_lshlrev_b32_e32 v39, 3, v62 v_add3_u32 v63, v38, v60, v39 v_ashrrev_i16_e32 v38, 3, v59 v_bfe_i32 v64, v38, 0, 16 v_and_b32_e32 v38, -8, v59 v_sub_u16_e32 v38, v56, v38 
v_bfe_i32 v59, v38, 0, 16 v_add_u32_e32 v38, v12, v52 v_add_u32_e32 v39, v16, v53 v_add_lshl_u32 v40, v11, v15, 1 v_cmp_le_i32_e32 vcc, s6, v38 v_cmp_gt_i32_e64 s[0:1], s5, v38 ds_write2_b64 v40, v[48:49], v[50:51] offset1:2 ds_write2_b64 v40, v[1:2], v[5:6] offset0:4 offset1:6 ds_write2_b64 v40, v[9:10], v[13:14] offset0:8 offset1:10 ds_write2_b64 v40, v[3:4], v[7:8] offset0:12 offset1:14 s_waitcnt vmcnt(2) ;;#ASMSTART v_pack_b32_f16 v1, v17, v21 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v17, v21, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(0) ;;#ASMSTART v_pack_b32_f16 v2, v25, v33 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v25, v33, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v18, v22 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v18, v22, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v26, v34 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v26, v34, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v19, v23 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v11, v19, v23, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v27, v35 ;;#ASMEND v_add_u32_e32 v17, 0x4000, v40 s_and_b64 s[26:27], vcc, s[0:1] v_cmp_le_i32_e32 vcc, s25, v39 v_cmp_gt_i32_e64 s[0:1], s7, v39 ;;#ASMSTART v_pack_b32_f16 v12, v27, v35, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v20, v24 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v15, v20, v24, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v28, v36 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v16, v28, v36, op_sel:[1, 1] ;;#ASMEND ds_write2_b64 v17, v[1:2], v[3:4] offset0:128 offset1:130 ds_write2_b64 v17, v[5:6], v[7:8] offset0:132 offset1:134 ds_write2_b64 v17, v[9:10], v[11:12] offset0:136 offset1:138 ds_write2_b64 v17, v[13:14], v[15:16] offset0:140 offset1:142 v_lshlrev_b32_e32 v9, 1, v61 v_add_u32_e32 v10, s2, v61 s_and_b64 s[0:1], vcc, s[0:1] v_lshlrev_b32_e32 v11, 1, v10 buffer_load_dwordx4 v[1:4], v9, s[12:15], 0 offen buffer_load_dwordx4 v[5:8], v11, s[12:15], 0 offen v_add_u32_e32 v9, s2, v10 s_and_b64 s[0:1], s[26:27], 
s[0:1] v_add_u32_e32 v49, s2, v9 v_cndmask_b32_e64 v25, v54, 0, s[0:1] v_lshlrev_b32_e32 v17, 1, v9 v_lshlrev_b32_e32 v18, 1, v49 v_lshl_add_u32 v26, v41, 1, v25 v_add_u32_e32 v27, s20, v41 buffer_load_dwordx4 v[9:12], v17, s[12:15], 0 offen buffer_load_dwordx4 v[13:16], v18, s[12:15], 0 offen v_lshl_add_u32 v28, v27, 1, v25 buffer_load_dwordx4 v[17:20], v26, s[16:19], 0 offen buffer_load_dwordx4 v[21:24], v28, s[16:19], 0 offen v_add_u32_e32 v26, s20, v27 v_add_u32_e32 v55, s20, v26 v_lshl_add_u32 v48, v26, 1, v25 v_lshl_add_u32 v50, v55, 1, v25 v_add_u32_e32 v25, 64, v56 v_lshrrev_b32_e32 v25, 3, v25 v_sub_u32_e32 v25, v25, v64 v_mul_lo_u32 v25, v25, s24 s_movk_i32 s0, 0x80 s_movk_i32 s1, 0xc0 v_and_b32_e32 v26, 7, v56 v_add_u32_e32 v27, s0, v56 v_add_u32_e32 v28, s1, v56 v_sub_u32_e32 v26, v26, v59 v_lshrrev_b32_e32 v27, 3, v27 v_lshrrev_b32_e32 v28, 3, v28 v_sub_u32_e32 v27, v27, v64 v_sub_u32_e32 v28, v28, v64 v_lshlrev_b32_e32 v26, 3, v26 v_add_u32_e32 v54, 64, v31 v_mul_lo_u32 v27, v27, s24 v_mul_lo_u32 v28, v28, s24 v_add_u32_e32 v51, v26, v25 v_ashrrev_i32_e32 v25, 31, v54 v_lshrrev_b32_e32 v25, 29, v25 v_add_u32_e32 v56, v54, v25 v_lshlrev_b32_e32 v52, 3, v59 v_mad_i32_i24 v60, v64, s24, v60 v_ashrrev_i32_e32 v25, 3, v56 v_add_lshl_u32 v41, v60, v52, 1 v_add_u32_e32 v52, v26, v27 v_add_u32_e32 v53, v26, v28 v_sub_u32_e32 v59, v25, v46 buffer_load_dwordx4 v[25:28], v48, s[16:19], 0 offen buffer_load_dwordx4 v[33:36], v50, s[16:19], 0 offen s_mov_b32 s26, 0xffffff8 v_mul_lo_u32 v48, v59, s24 v_and_b32_e32 v50, s26, v56 v_sub_u32_e32 v50, v54, v50 v_sub_u32_e32 v50, v50, v62 v_lshl_add_u32 v59, v50, 3, v48 v_add_u32_e32 v48, s0, v31 v_ashrrev_i32_e32 v50, 31, v48 v_lshrrev_b32_e32 v50, 29, v50 v_add_u32_e32 v50, v48, v50 v_ashrrev_i32_e32 v54, 3, v50 v_sub_u32_e32 v54, v54, v46 v_mul_lo_u32 v54, v54, s24 v_and_b32_e32 v50, s26, v50 v_sub_u32_e32 v48, v48, v50 v_sub_u32_e32 v48, v48, v62 v_lshl_add_u32 v60, v48, 3, v54 v_add_u32_e32 v48, s1, v31 
v_ashrrev_i32_e32 v50, 31, v48 v_lshrrev_b32_e32 v50, 29, v50 v_add_u32_e32 v50, v48, v50 v_ashrrev_i32_e32 v54, 3, v50 v_sub_u32_e32 v61, v54, v46 v_and_b32_e32 v46, s26, v50 v_lshl_add_u32 v50, v51, 1, v41 v_mul_lo_u32 v51, v61, s24 s_movk_i32 s0, 0x4400 v_sub_u32_e32 v64, v48, v46 v_lshl_add_u32 v54, v63, 1, s0 v_lshl_add_u32 v46, v53, 1, v41 v_lshl_add_u32 v53, v59, 1, v54 v_sub_u32_e32 v59, v64, v62 v_accvgpr_write_b32 a241, 0 v_accvgpr_write_b32 a242, 0 v_accvgpr_write_b32 a243, 0 v_accvgpr_write_b32 a244, 0 v_accvgpr_write_b32 a245, 0 v_accvgpr_write_b32 a246, 0 v_accvgpr_write_b32 a247, 0 v_accvgpr_write_b32 a248, 0 v_accvgpr_write_b32 a249, 0 v_accvgpr_write_b32 a250, 0 v_accvgpr_write_b32 a251, 0 v_accvgpr_write_b32 a252, 0 v_accvgpr_write_b32 a253, 0 v_accvgpr_write_b32 a254, 0 v_accvgpr_write_b32 a255, 0 v_accvgpr_write_b32 a224, 0 v_accvgpr_write_b32 a208, 0 v_accvgpr_write_b32 a192, 0 v_accvgpr_write_b32 a128, 0 v_accvgpr_write_b32 a144, 0 v_accvgpr_write_b32 a160, 0 v_accvgpr_write_b32 a176, 0 v_accvgpr_write_b32 a112, 0 v_accvgpr_write_b32 a96, 0 v_accvgpr_write_b32 a80, 0 v_accvgpr_write_b32 a64, 0 v_accvgpr_write_b32 a0, 0 v_accvgpr_write_b32 a16, 0 v_accvgpr_write_b32 a32, 0 v_accvgpr_write_b32 a48, 0 v_lshl_add_u32 v51, v59, 3, v51 v_add_u32_e32 v57, 8, v57 v_add_u32_e32 v58, 8, v58 v_accvgpr_write_b32 a225, 0 v_accvgpr_write_b32 a226, 0 v_accvgpr_write_b32 a227, 0 v_accvgpr_write_b32 a228, 0 v_accvgpr_write_b32 a229, 0 v_accvgpr_write_b32 a230, 0 v_accvgpr_write_b32 a231, 0 v_accvgpr_write_b32 a232, 0 v_accvgpr_write_b32 a233, 0 v_accvgpr_write_b32 a234, 0 v_accvgpr_write_b32 a235, 0 v_accvgpr_write_b32 a236, 0 v_accvgpr_write_b32 a237, 0 v_accvgpr_write_b32 a238, 0 v_accvgpr_write_b32 a239, 0 v_accvgpr_write_b32 a209, 0 v_accvgpr_write_b32 a210, 0 v_accvgpr_write_b32 a211, 0 v_accvgpr_write_b32 a212, 0 v_accvgpr_write_b32 a213, 0 v_accvgpr_write_b32 a214, 0 v_accvgpr_write_b32 a215, 0 v_accvgpr_write_b32 a216, 0 v_accvgpr_write_b32 a217, 0 
v_accvgpr_write_b32 a218, 0 v_accvgpr_write_b32 a219, 0 v_accvgpr_write_b32 a220, 0 v_accvgpr_write_b32 a221, 0 v_accvgpr_write_b32 a222, 0 v_accvgpr_write_b32 a223, 0 v_accvgpr_write_b32 a193, 0 v_accvgpr_write_b32 a194, 0 v_accvgpr_write_b32 a195, 0 v_accvgpr_write_b32 a196, 0 v_accvgpr_write_b32 a197, 0 v_accvgpr_write_b32 a198, 0 v_accvgpr_write_b32 a199, 0 v_accvgpr_write_b32 a200, 0 v_accvgpr_write_b32 a201, 0 v_accvgpr_write_b32 a202, 0 v_accvgpr_write_b32 a203, 0 v_accvgpr_write_b32 a204, 0 v_accvgpr_write_b32 a205, 0 v_accvgpr_write_b32 a206, 0 v_accvgpr_write_b32 a207, 0 v_accvgpr_write_b32 a129, 0 v_accvgpr_write_b32 a130, 0 v_accvgpr_write_b32 a131, 0 v_accvgpr_write_b32 a132, 0 v_accvgpr_write_b32 a133, 0 v_accvgpr_write_b32 a134, 0 v_accvgpr_write_b32 a135, 0 v_accvgpr_write_b32 a136, 0 v_accvgpr_write_b32 a137, 0 v_accvgpr_write_b32 a138, 0 v_accvgpr_write_b32 a139, 0 v_accvgpr_write_b32 a140, 0 v_accvgpr_write_b32 a141, 0 v_accvgpr_write_b32 a142, 0 v_accvgpr_write_b32 a143, 0 v_accvgpr_write_b32 a145, 0 v_accvgpr_write_b32 a146, 0 v_accvgpr_write_b32 a147, 0 v_accvgpr_write_b32 a148, 0 v_accvgpr_write_b32 a149, 0 v_accvgpr_write_b32 a150, 0 v_accvgpr_write_b32 a151, 0 v_accvgpr_write_b32 a152, 0 v_accvgpr_write_b32 a153, 0 v_accvgpr_write_b32 a154, 0 v_accvgpr_write_b32 a155, 0 v_accvgpr_write_b32 a156, 0 v_accvgpr_write_b32 a157, 0 v_accvgpr_write_b32 a158, 0 v_accvgpr_write_b32 a159, 0 v_accvgpr_write_b32 a161, 0 v_accvgpr_write_b32 a162, 0 v_accvgpr_write_b32 a163, 0 v_accvgpr_write_b32 a164, 0 v_accvgpr_write_b32 a165, 0 v_accvgpr_write_b32 a166, 0 v_accvgpr_write_b32 a167, 0 v_accvgpr_write_b32 a168, 0 v_accvgpr_write_b32 a169, 0 v_accvgpr_write_b32 a170, 0 v_accvgpr_write_b32 a171, 0 v_accvgpr_write_b32 a172, 0 v_accvgpr_write_b32 a173, 0 v_accvgpr_write_b32 a174, 0 v_accvgpr_write_b32 a175, 0 v_accvgpr_write_b32 a177, 0 v_accvgpr_write_b32 a178, 0 v_accvgpr_write_b32 a179, 0 v_accvgpr_write_b32 a180, 0 v_accvgpr_write_b32 a181, 0 
v_accvgpr_write_b32 a182, 0 v_accvgpr_write_b32 a183, 0 v_accvgpr_write_b32 a184, 0 v_accvgpr_write_b32 a185, 0 v_accvgpr_write_b32 a186, 0 v_accvgpr_write_b32 a187, 0 v_accvgpr_write_b32 a188, 0 v_accvgpr_write_b32 a189, 0 v_accvgpr_write_b32 a190, 0 v_accvgpr_write_b32 a191, 0 v_accvgpr_write_b32 a113, 0 v_accvgpr_write_b32 a114, 0 v_accvgpr_write_b32 a115, 0 v_accvgpr_write_b32 a116, 0 v_accvgpr_write_b32 a117, 0 v_accvgpr_write_b32 a118, 0 v_accvgpr_write_b32 a119, 0 v_accvgpr_write_b32 a120, 0 v_accvgpr_write_b32 a121, 0 v_accvgpr_write_b32 a122, 0 v_accvgpr_write_b32 a123, 0 v_accvgpr_write_b32 a124, 0 v_accvgpr_write_b32 a125, 0 v_accvgpr_write_b32 a126, 0 v_accvgpr_write_b32 a127, 0 v_accvgpr_write_b32 a97, 0 v_accvgpr_write_b32 a98, 0 v_accvgpr_write_b32 a99, 0 v_accvgpr_write_b32 a100, 0 v_accvgpr_write_b32 a101, 0 v_accvgpr_write_b32 a102, 0 v_accvgpr_write_b32 a103, 0 v_accvgpr_write_b32 a104, 0 v_accvgpr_write_b32 a105, 0 v_accvgpr_write_b32 a106, 0 v_accvgpr_write_b32 a107, 0 v_accvgpr_write_b32 a108, 0 v_accvgpr_write_b32 a109, 0 v_accvgpr_write_b32 a110, 0 v_accvgpr_write_b32 a111, 0 v_accvgpr_write_b32 a81, 0 v_accvgpr_write_b32 a82, 0 v_accvgpr_write_b32 a83, 0 v_accvgpr_write_b32 a84, 0 v_accvgpr_write_b32 a85, 0 v_accvgpr_write_b32 a86, 0 v_accvgpr_write_b32 a87, 0 v_accvgpr_write_b32 a88, 0 v_accvgpr_write_b32 a89, 0 v_accvgpr_write_b32 a90, 0 v_accvgpr_write_b32 a91, 0 v_accvgpr_write_b32 a92, 0 v_accvgpr_write_b32 a93, 0 v_accvgpr_write_b32 a94, 0 v_accvgpr_write_b32 a95, 0 v_accvgpr_write_b32 a65, 0 v_accvgpr_write_b32 a66, 0 v_accvgpr_write_b32 a67, 0 v_accvgpr_write_b32 a68, 0 v_accvgpr_write_b32 a69, 0 v_accvgpr_write_b32 a70, 0 v_accvgpr_write_b32 a71, 0 v_accvgpr_write_b32 a72, 0 v_accvgpr_write_b32 a73, 0 v_accvgpr_write_b32 a74, 0 v_accvgpr_write_b32 a75, 0 v_accvgpr_write_b32 a76, 0 v_accvgpr_write_b32 a77, 0 v_accvgpr_write_b32 a78, 0 v_accvgpr_write_b32 a79, 0 v_accvgpr_write_b32 a1, 0 v_accvgpr_write_b32 a2, 0 v_accvgpr_write_b32 
a3, 0 v_accvgpr_write_b32 a4, 0 v_accvgpr_write_b32 a5, 0 v_accvgpr_write_b32 a6, 0 v_accvgpr_write_b32 a7, 0 v_accvgpr_write_b32 a8, 0 v_accvgpr_write_b32 a9, 0 v_accvgpr_write_b32 a10, 0 v_accvgpr_write_b32 a11, 0 v_accvgpr_write_b32 a12, 0 v_accvgpr_write_b32 a13, 0 v_accvgpr_write_b32 a14, 0 v_accvgpr_write_b32 a15, 0 v_accvgpr_write_b32 a17, 0 v_accvgpr_write_b32 a18, 0 v_accvgpr_write_b32 a19, 0 v_accvgpr_write_b32 a20, 0 v_accvgpr_write_b32 a21, 0 v_accvgpr_write_b32 a22, 0 v_accvgpr_write_b32 a23, 0 v_accvgpr_write_b32 a24, 0 v_accvgpr_write_b32 a25, 0 v_accvgpr_write_b32 a26, 0 v_accvgpr_write_b32 a27, 0 v_accvgpr_write_b32 a28, 0 v_accvgpr_write_b32 a29, 0 v_accvgpr_write_b32 a30, 0 v_accvgpr_write_b32 a31, 0 v_accvgpr_write_b32 a33, 0 v_accvgpr_write_b32 a34, 0 v_accvgpr_write_b32 a35, 0 v_accvgpr_write_b32 a36, 0 v_accvgpr_write_b32 a37, 0 v_accvgpr_write_b32 a38, 0 v_accvgpr_write_b32 a39, 0 v_accvgpr_write_b32 a40, 0 v_accvgpr_write_b32 a41, 0 v_accvgpr_write_b32 a42, 0 v_accvgpr_write_b32 a43, 0 v_accvgpr_write_b32 a44, 0 v_accvgpr_write_b32 a45, 0 v_accvgpr_write_b32 a46, 0 v_accvgpr_write_b32 a47, 0 v_accvgpr_write_b32 a49, 0 v_accvgpr_write_b32 a50, 0 v_accvgpr_write_b32 a51, 0 v_accvgpr_write_b32 a52, 0 v_accvgpr_write_b32 a53, 0 v_accvgpr_write_b32 a54, 0 v_accvgpr_write_b32 a55, 0 v_accvgpr_write_b32 a56, 0 v_accvgpr_write_b32 a57, 0 v_accvgpr_write_b32 a58, 0 v_accvgpr_write_b32 a59, 0 v_accvgpr_write_b32 a60, 0 v_accvgpr_write_b32 a61, 0 v_accvgpr_write_b32 a62, 0 v_accvgpr_write_b32 a63, 0 s_mov_b32 s34, 0 s_mov_b32 s4, s29 v_add_u32_e32 v56, s0, v40 v_lshl_add_u32 v48, v52, 1, v41 v_lshl_add_u32 v52, v60, 1, v54 v_lshl_add_u32 v51, v51, 1, v54 s_add_i32 s24, s40, -4 s_sub_i32 s26, 0, s39 s_sub_i32 s27, 0, s44 s_movk_i32 s29, 0x1000 v_mov_b32_e32 v59, v58 v_mov_b32_e32 v60, v57 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND .LBB1_1: ; 
%_ZZN2ck22move_tensor_coordinateINS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS2_IJiiiEEELb0EEENS3_INS2_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESB_NS_23Merge_v2_magic_divisionINS2_IJiiEEEEESB_NSA_IS7_EENS3_ISD_Lb0EEESB_SF_EEENS2_IJNS_8SequenceIJLi0EEEENSI_IJLi1EEEENSI_IJLi2EEEENSI_IJLi3EEEENSI_IJLi4ELi6EEEENSI_IJLi7EEEENSI_IJLi5EEEENSI_IJLi8EEEENSI_IJLi9EEEENSI_IJLi10EEEEEEENS2_IJNSI_IJLi1ELi2ELi3EEEENSI_IJLi4ELi5EEEENSI_IJLi6EEEESO_SQ_SR_SS_NSI_IJLi11ELi12EEEENSI_IJLi13EEEENSI_IJLi14EEEEEEENSI_IJLi11ELi12ELi13ELi14EEEEiEENS_16TensorCoordinateILi15EKS11_EENS_20TensorCoordinateStepILi10ELi4ENSI_IJLi0ELi0ELi0ELi0ELi0ELi0ELi0ELi0ELi0ELi0EEEEEEEEvRKT_RT0_RKT1_ENKUlS19_E_clINS6_IiLi9EEEEEDaS19_.exit.i.i.i.i.i244.i ; =>This Inner Loop Header: Depth=1 ds_read2_b64 v[61:64], v41 offset1:1 ds_read2_b64 v[65:68], v54 offset1:1 ds_read2_b64 v[69:72], v53 offset1:1 ds_read2_b64 v[73:76], v52 offset1:1 ds_read2_b64 v[77:80], v51 offset1:1 ds_read2_b64 v[81:84], v50 offset1:1 s_waitcnt lgkmcnt(4) v_mfma_f32_32x32x8f16 a[240:255], v[61:62], v[65:66], a[240:255] ds_read2_b64 v[85:88], v48 offset1:1 ds_read2_b64 v[89:92], v46 offset1:1 v_add_u32_e32 v37, 4, v37 s_waitcnt lgkmcnt(5) v_mfma_f32_32x32x8f16 a[224:239], v[61:62], v[69:70], a[224:239] s_waitcnt lgkmcnt(4) v_mfma_f32_32x32x8f16 a[208:223], v[61:62], v[73:74], a[208:223] s_waitcnt lgkmcnt(3) v_mfma_f32_32x32x8f16 a[192:207], v[61:62], v[77:78], a[192:207] v_add_u32_e32 v61, s29, v41 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[81:82], v[65:66], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[81:82], v[69:70], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[81:82], v[73:74], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[81:82], v[77:78], a[176:191] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[85:86], v[65:66], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[85:86], v[69:70], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[85:86], v[73:74], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], 
v[85:86], v[77:78], a[64:79] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[89:90], v[65:66], a[0:15] v_add_u32_e32 v65, s29, v54 v_mfma_f32_32x32x8f16 a[16:31], v[89:90], v[69:70], a[16:31] v_add_u32_e32 v69, s29, v53 v_mfma_f32_32x32x8f16 a[32:47], v[89:90], v[73:74], a[32:47] v_add_u32_e32 v73, s29, v52 v_mfma_f32_32x32x8f16 a[48:63], v[89:90], v[77:78], a[48:63] v_add_u32_e32 v77, s29, v51 v_mfma_f32_32x32x8f16 a[240:255], v[63:64], v[67:68], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[63:64], v[71:72], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[63:64], v[75:76], a[208:223] v_mfma_f32_32x32x8f16 a[192:207], v[63:64], v[79:80], a[192:207] v_mfma_f32_32x32x8f16 a[128:143], v[83:84], v[67:68], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[83:84], v[71:72], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[83:84], v[75:76], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[83:84], v[79:80], a[176:191] v_mfma_f32_32x32x8f16 a[112:127], v[87:88], v[67:68], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[87:88], v[71:72], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[87:88], v[75:76], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[87:88], v[79:80], a[64:79] v_mfma_f32_32x32x8f16 a[0:15], v[91:92], v[67:68], a[0:15] ds_read2_b64 v[93:96], v61 offset0:32 offset1:33 ds_read2_b64 v[97:100], v65 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[16:31], v[91:92], v[71:72], a[16:31] ds_read2_b64 v[101:104], v69 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[32:47], v[91:92], v[75:76], a[32:47] ds_read2_b64 v[105:108], v73 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[48:63], v[91:92], v[79:80], a[48:63] ds_read2_b64 v[109:112], v77 offset0:32 offset1:33 s_waitcnt lgkmcnt(3) v_mfma_f32_32x32x8f16 a[240:255], v[93:94], v[97:98], a[240:255] s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[224:239], v[93:94], v[101:102], a[224:239] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[208:223], v[93:94], v[105:106], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[93:94], 
v[109:110], a[192:207] v_add_u32_e32 v61, s29, v50 ds_read2_b64 v[113:116], v61 offset0:32 offset1:33 v_add_u32_e32 v61, s29, v48 ds_read2_b64 v[117:120], v61 offset0:32 offset1:33 v_add_u32_e32 v61, s29, v46 ds_read2_b64 v[121:124], v61 offset0:32 offset1:33 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[113:114], v[97:98], a[128:143] ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[117:118], v[97:98], a[112:127] s_waitcnt vmcnt(6) ;;#ASMSTART v_pack_b32_f16 v61, v1, v5 ;;#ASMEND v_mul_hi_u32 v65, s4, v60 v_mul_hi_u32 v66, s10, v59 v_add_u32_e32 v60, 4, v60 v_add_u32_e32 v59, 4, v59 ;;#ASMSTART v_pack_b32_f16 v63, v1, v5, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(4) ;;#ASMSTART v_pack_b32_f16 v62, v9, v13 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v64, v9, v13, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v1, v2, v6 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v2, v6, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[121:122], v[97:98], a[0:15] v_pack_b32_f16 v2, v10, v14 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v10, v14, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v3, v7 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v3, v7, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v11, v15 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v11, v15, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v4, v8 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v4, v8, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v12, v16 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v12, v16, op_sel:[1, 1] ;;#ASMEND v_mfma_f32_32x32x8f16 a[240:255], v[95:96], v[99:100], a[240:255] ds_write2_b64 v40, v[61:62], v[63:64] offset1:2 v_mfma_f32_32x32x8f16 a[224:239], v[95:96], v[103:104], a[224:239] ds_write2_b64 v40, v[1:2], v[5:6] offset0:4 offset1:6 v_mfma_f32_32x32x8f16 a[208:223], v[95:96], v[107:108], a[208:223] ds_write2_b64 v40, v[9:10], v[13:14] offset0:8 offset1:10 
v_mfma_f32_32x32x8f16 a[192:207], v[95:96], v[111:112], a[192:207] ds_write2_b64 v40, v[3:4], v[7:8] offset0:12 offset1:14 s_waitcnt vmcnt(2) ;;#ASMSTART v_pack_b32_f16 v1, v17, v21 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v17, v21, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(0) ;;#ASMSTART v_pack_b32_f16 v2, v25, v33 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v25, v33, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_mfma_f32_32x32x8f16 a[144:159], v[113:114], v[101:102], a[144:159] v_pack_b32_f16 v5, v18, v22 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v18, v22, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v26, v34 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v26, v34, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v19, v23 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v11, v19, v23, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v27, v35 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v12, v27, v35, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v20, v24 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v15, v20, v24, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v28, v36 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v16, v28, v36, op_sel:[1, 1] ;;#ASMEND v_mfma_f32_32x32x8f16 a[96:111], v[117:118], v[101:102], a[96:111] ds_write2_b64 v56, v[1:2], v[3:4] offset1:2 v_mfma_f32_32x32x8f16 a[16:31], v[121:122], v[101:102], a[16:31] ds_write2_b64 v56, v[5:6], v[7:8] offset0:4 offset1:6 v_mfma_f32_32x32x8f16 a[160:175], v[113:114], v[105:106], a[160:175] ds_write2_b64 v56, v[9:10], v[11:12] offset0:8 offset1:10 v_mfma_f32_32x32x8f16 a[176:191], v[113:114], v[109:110], a[176:191] ds_write2_b64 v56, v[13:14], v[15:16] offset0:12 offset1:14 v_add3_u32 v1, v58, v66, s34 v_add3_u32 v2, v57, v65, s34 v_lshrrev_b32_e32 v61, s11, v1 v_lshrrev_b32_e32 v62, s31, v2 v_mul_lo_u32 v63, s27, v61 v_mfma_f32_32x32x8f16 a[80:95], v[117:118], v[105:106], a[80:95] v_mul_hi_u32 v3, v62, s28 v_mul_lo_u32 v2, s26, v62 v_sub_u32_e32 v4, v63, v43 v_add_u32_e32 v3, v62, v3 v_sub_u32_e32 v2, v2, v47 
v_lshrrev_b32_e32 v64, s30, v3 v_add3_u32 v2, v57, s34, v2 v_mul_lo_u32 v2, v2, s37 v_sub_u32_e32 v1, v61, v42 v_sub_u32_e32 v5, v64, v45 v_lshl_add_u32 v1, v1, 3, -3 v_mfma_f32_32x32x8f16 a[64:79], v[117:118], v[109:110], a[64:79] v_add_u32_e32 v38, v2, v38 v_mul_lo_u32 v9, v2, s22 v_lshl_add_u32 v5, v5, 3, -3 v_mul_lo_u32 v1, v1, s2 v_mul_lo_u32 v17, v5, s20 v_cmp_le_i32_e32 vcc, s6, v38 v_cmp_gt_i32_e64 s[0:1], s5, v38 s_and_b64 s[44:45], vcc, s[0:1] v_add_u32_e32 v19, v9, v55 v_mul_lo_u32 v43, v62, s39 v_mfma_f32_32x32x8f16 a[32:47], v[121:122], v[105:106], a[32:47] v_mov_b32_e32 v42, v61 v_mov_b32_e32 v45, v64 v_sub_u32_e32 v47, v37, v43 v_add_u32_e32 v69, s34, v58 v_add_u32_e32 v4, v69, v4 v_mul_lo_u32 v3, v4, s3 v_mul_lo_u32 v4, v64, s38 v_mov_b32_e32 v70, s23 s_add_i32 s34, s34, 4 v_add3_u32 v1, v1, v49, v3 v_sub_u32_e32 v65, v62, v4 v_sub_u32_e32 v2, v65, v44 v_mfma_f32_32x32x8f16 a[48:63], v[121:122], v[109:110], a[48:63] v_mul_lo_u32 v18, v2, s35 v_lshlrev_b32_e32 v3, 1, v1 v_add_u32_e32 v1, s2, v1 v_add_u32_e32 v10, s2, v1 v_add_u32_e32 v39, v18, v39 v_mul_lo_u32 v18, v18, s21 v_cmp_le_i32_e32 vcc, s25, v39 v_cmp_gt_i32_e64 s[0:1], s7, v39 s_and_b64 s[0:1], vcc, s[0:1] s_and_b64 s[0:1], s[44:45], s[0:1] v_cndmask_b32_e64 v25, v70, 0, s[0:1] v_add3_u32 v17, v19, v17, v18 v_mfma_f32_32x32x8f16 a[128:143], v[115:116], v[99:100], a[128:143] v_lshl_add_u32 v18, v17, 1, v25 v_add_u32_e32 v17, s20, v17 v_add_u32_e32 v26, s20, v17 v_add_u32_e32 v49, s2, v10 v_add_u32_e32 v55, s20, v26 v_lshlrev_b32_e32 v5, 1, v1 v_lshlrev_b32_e32 v11, 1, v10 v_lshlrev_b32_e32 v13, 1, v49 v_lshl_add_u32 v21, v17, 1, v25 v_lshl_add_u32 v27, v26, 1, v25 v_lshl_add_u32 v33, v55, 1, v25 v_mfma_f32_32x32x8f16 a[144:159], v[115:116], v[103:104], a[144:159] buffer_load_dwordx4 v[1:4], v3, s[12:15], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[160:175], v[115:116], v[107:108], a[160:175] buffer_load_dwordx4 v[5:8], v5, s[12:15], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[176:191], v[115:116], 
v[111:112], a[176:191] buffer_load_dwordx4 v[9:12], v11, s[12:15], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[112:127], v[119:120], v[99:100], a[112:127] buffer_load_dwordx4 v[13:16], v13, s[12:15], 0 offen ;;#ASMSTART s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[96:111], v[119:120], v[103:104], a[96:111] s_barrier ;;#ASMEND v_mfma_f32_32x32x8f16 a[80:95], v[119:120], v[107:108], a[80:95] buffer_load_dwordx4 v[17:20], v18, s[16:19], 0 offen s_nop 0 v_mfma_f32_32x32x8f16 a[64:79], v[119:120], v[111:112], a[64:79] buffer_load_dwordx4 v[21:24], v21, s[16:19], 0 offen v_mfma_f32_32x32x8f16 a[0:15], v[123:124], v[99:100], a[0:15] buffer_load_dwordx4 v[25:28], v27, s[16:19], 0 offen v_add_u32_e32 v44, v69, v63 v_mfma_f32_32x32x8f16 a[16:31], v[123:124], v[103:104], a[16:31] buffer_load_dwordx4 v[33:36], v33, s[16:19], 0 offen v_mov_b32_e32 v43, v44 s_cmp_lt_i32 s34, s24 v_mov_b32_e32 v44, v65 v_mfma_f32_32x32x8f16 a[32:47], v[123:124], v[107:108], a[32:47] v_mfma_f32_32x32x8f16 a[48:63], v[123:124], v[111:112], a[48:63] s_cbranch_scc1 .LBB1_1 ; %bb.2: ; %_ZZN2ck23Merge_v2_magic_divisionINS_5TupleIJNS_17integral_constantIiLi4EEENS2_IiLi2EEEiiiEEEEC1ERKS5_ENKUlT_E_clIS4_EEDaS9_.exit.i.i.i.i.i.i.i.i ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_waitcnt vmcnt(7) ds_read2_b64 v[1:4], v41 offset1:1 s_waitcnt vmcnt(6) ds_read2_b64 v[5:8], v54 offset1:1 s_waitcnt vmcnt(5) ds_read2_b64 v[9:12], v53 offset1:1 s_waitcnt vmcnt(4) ds_read2_b64 v[13:16], v52 offset1:1 s_waitcnt vmcnt(3) ds_read2_b64 v[17:20], v51 offset1:1 s_waitcnt lgkmcnt(3) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[240:255] s_movk_i32 s0, 0x1000 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[224:239] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[17:18], a[192:207] v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], 
v[11:12], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[19:20], a[192:207] ds_read2_b64 v[1:4], v50 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[128:143], v[1:2], v[5:6], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[1:2], v[9:10], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[1:2], v[13:14], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[1:2], v[17:18], a[176:191] v_mfma_f32_32x32x8f16 a[128:143], v[3:4], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[3:4], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[3:4], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[3:4], v[19:20], a[176:191] ds_read2_b64 v[1:4], v48 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[112:127], v[1:2], v[5:6], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[1:2], v[9:10], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[1:2], v[13:14], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[1:2], v[17:18], a[64:79] v_mfma_f32_32x32x8f16 a[112:127], v[3:4], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[3:4], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[3:4], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[3:4], v[19:20], a[64:79] ds_read2_b64 v[1:4], v46 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[1:2], v[5:6], a[0:15] v_add_u32_e32 v5, s0, v54 v_mfma_f32_32x32x8f16 a[16:31], v[1:2], v[9:10], a[16:31] v_add_u32_e32 v9, s0, v53 v_mfma_f32_32x32x8f16 a[32:47], v[1:2], v[13:14], a[32:47] v_add_u32_e32 v13, s0, v52 v_mfma_f32_32x32x8f16 a[48:63], v[1:2], v[17:18], a[48:63] v_add_u32_e32 v1, s0, v41 v_add_u32_e32 v17, s0, v51 v_mfma_f32_32x32x8f16 a[0:15], v[3:4], v[7:8], a[0:15] ds_read2_b64 v[5:8], v5 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[16:31], v[3:4], v[11:12], a[16:31] ds_read2_b64 v[9:12], v9 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[32:47], v[3:4], v[15:16], a[32:47] ds_read2_b64 v[13:16], v13 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 
a[48:63], v[3:4], v[19:20], a[48:63] ds_read2_b64 v[1:4], v1 offset0:32 offset1:33 ds_read2_b64 v[17:20], v17 offset0:32 offset1:33 s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[17:18], a[192:207] v_add_u32_e32 v1, s0, v50 s_waitcnt vmcnt(2) ds_read2_b64 v[21:24], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v48 s_waitcnt vmcnt(1) ds_read2_b64 v[25:28], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v46 ds_read2_b64 v[49:52], v1 offset0:32 offset1:33 s_movk_i32 s0, 0x80 v_cmp_gt_u32_e32 vcc, s0, v0 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[21:22], v[5:6], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[21:22], v[9:10], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[21:22], v[13:14], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[21:22], v[17:18], a[176:191] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[25:26], v[5:6], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[25:26], v[9:10], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[25:26], v[13:14], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[25:26], v[17:18], a[64:79] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[49:50], v[5:6], a[0:15] v_mfma_f32_32x32x8f16 a[16:31], v[49:50], v[9:10], a[16:31] v_mfma_f32_32x32x8f16 a[32:47], v[49:50], v[13:14], a[32:47] v_mfma_f32_32x32x8f16 a[48:63], v[49:50], v[17:18], a[48:63] v_mul_i32_i24_e32 v17, 0xffffffe0, v29 v_mov_b32_e32 v49, 0 v_mov_b32_e32 v50, 0 v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], v[11:12], a[224:239] s_waitcnt vmcnt(0) s_nop 7 s_nop 7 v_accvgpr_read_b32 v33, a240 v_accvgpr_read_b32 v34, a241 v_accvgpr_read_b32 v35, a242 v_accvgpr_read_b32 v36, a243 v_accvgpr_read_b32 v37, a244 v_accvgpr_read_b32 v38, a245 v_accvgpr_read_b32 v39, a246 v_accvgpr_read_b32 
v40, a247 v_accvgpr_read_b32 v41, a248 v_accvgpr_read_b32 v42, a249 v_accvgpr_read_b32 v43, a250 v_accvgpr_read_b32 v44, a251 v_accvgpr_read_b32 v45, a252 v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[19:20], a[192:207] v_accvgpr_read_b32 v46, a253 v_accvgpr_read_b32 v47, a254 v_accvgpr_read_b32 v48, a255 v_mfma_f32_32x32x8f16 a[144:159], v[23:24], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[176:191], v[23:24], v[19:20], a[176:191] v_mfma_f32_32x32x8f16 a[96:111], v[27:28], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[64:79], v[27:28], v[19:20], a[64:79] v_mfma_f32_32x32x8f16 a[16:31], v[51:52], v[11:12], a[16:31] v_mfma_f32_32x32x8f16 a[48:63], v[51:52], v[19:20], a[48:63] v_mfma_f32_32x32x8f16 a[128:143], v[23:24], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[112:127], v[27:28], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[0:15], v[51:52], v[7:8], a[0:15] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[80:95], v[27:28], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[160:175], v[23:24], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[32:47], v[51:52], v[15:16], a[32:47] s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_4 ; %bb.3: v_lshrrev_b32_e32 v1, 2, v0 v_mul_i32_i24_e32 v2, -4, v1 v_add_u32_e32 v1, v17, v1 v_lshlrev_b32_e32 v3, 1, v1 v_add_u32_e32 v4, s42, v29 v_lshl_add_u32 v3, v4, 8, v3 v_mul_lo_u32 v3, v3, s33 v_add_lshl_u32 v2, v2, v0, 4 v_lshlrev_b32_e32 v4, 12, v29 v_lshlrev_b32_e32 v1, 7, v1 v_add3_u32 v50, v2, v4, v1 v_add3_u32 v49, s43, v2, v3 .LBB1_4: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EEC2ERSO_RKNSA_IJiiiiEEES15_S1A_RKS3_.exit.i s_or_b64 exec, exec, s[0:1] v_lshrrev_b32_e32 v0, 3, v0 v_and_or_b32 v0, v0, 4, v30 v_lshlrev_b32_e32 v19, 5, v29 v_lshrrev_b32_e32 v18, 6, v31 v_add3_u32 v0, v0, v17, v19 v_sub_u32_e32 v0, v0, v18 v_lshlrev_b32_e32 v0, 6, v0 v_cvt_f16_f32_e32 v17, v33 v_add_lshl_u32 v51, v0, v31, 1 v_cvt_f16_f32_e32 v0, v34 v_cvt_f16_f32_e32 v18, v35 v_cvt_f16_f32_e32 v19, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v17 ds_write_b16 v51, v0 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v17, v39 v_cvt_f16_f32_e32 v18, v38 v_cvt_f16_f32_e32 v19, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v17, v42 v_cvt_f16_f32_e32 v18, v43 v_cvt_f16_f32_e32 v19, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v17, v47 v_accvgpr_read_b32 v1, a224 v_cvt_f16_f32_e32 v18, v46 v_accvgpr_read_b32 v2, a225 v_accvgpr_read_b32 v3, a226 v_accvgpr_read_b32 v4, a227 v_accvgpr_read_b32 v5, a228 v_accvgpr_read_b32 v6, a229 v_accvgpr_read_b32 v7, a230 v_accvgpr_read_b32 
v8, a231 v_accvgpr_read_b32 v9, a232 v_accvgpr_read_b32 v10, a233 v_accvgpr_read_b32 v11, a234 v_accvgpr_read_b32 v12, a235 v_accvgpr_read_b32 v13, a236 v_accvgpr_read_b32 v14, a237 v_accvgpr_read_b32 v15, a238 v_accvgpr_read_b32 v16, a239 v_cvt_f16_f32_e32 v19, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_6 ; %bb.5: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i.i.i.i v_lshlrev_b32_e32 v0, 1, v50 ds_read2_b64 v[17:20], v0 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v17, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v20, v21, s[8:11], 12 offen ds_read2_b64 v[17:20], v0 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v17, v22, s[8:11], 0 offen buffer_atomic_pk_add_f16 v18, v22, s[8:11], 4 offen buffer_atomic_pk_add_f16 v19, v22, s[8:11], 8 offen buffer_atomic_pk_add_f16 v20, 
v22, s[8:11], 12 offen ds_read2_b64 v[17:20], v0 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v17, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v20, v21, s[8:11], 12 offen ds_read2_b64 v[17:20], v0 offset0:16 offset1:17 v_add_lshl_u32 v0, v49, s33, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v17, v0, s[8:11], 0 offen buffer_atomic_pk_add_f16 v18, v0, s[8:11], 4 offen buffer_atomic_pk_add_f16 v19, v0, s[8:11], 8 offen buffer_atomic_pk_add_f16 v20, v0, s[8:11], 12 offen .LBB1_6: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v1 v_cvt_f16_f32_e32 v1, v2 v_cvt_f16_f32_e32 v2, v3 v_cvt_f16_f32_e32 v3, v4 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 
v0, v8 v_cvt_f16_f32_e32 v1, v7 v_cvt_f16_f32_e32 v2, v6 v_cvt_f16_f32_e32 v3, v5 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v9 v_cvt_f16_f32_e32 v1, v10 v_cvt_f16_f32_e32 v2, v11 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v16 v_cvt_f16_f32_e32 v1, v15 v_accvgpr_read_b32 v33, a208 v_cvt_f16_f32_e32 v2, v14 v_accvgpr_read_b32 v34, a209 v_accvgpr_read_b32 v35, a210 v_accvgpr_read_b32 v36, a211 v_accvgpr_read_b32 v37, a212 v_accvgpr_read_b32 v38, a213 v_accvgpr_read_b32 v39, a214 v_accvgpr_read_b32 v40, a215 v_accvgpr_read_b32 v41, a216 v_accvgpr_read_b32 v42, a217 v_accvgpr_read_b32 v43, a218 v_accvgpr_read_b32 v44, a219 v_accvgpr_read_b32 v45, a220 v_accvgpr_read_b32 v46, a221 v_accvgpr_read_b32 v47, a222 v_accvgpr_read_b32 v48, a223 v_cvt_f16_f32_e32 v3, v13 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_8 ; %bb.7: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, 
v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_8: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v33 v_cvt_f16_f32_e32 v1, v34 v_cvt_f16_f32_e32 v2, v35 v_cvt_f16_f32_e32 v3, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v1, v39 v_cvt_f16_f32_e32 v2, v38 v_cvt_f16_f32_e32 v3, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v1, v42 v_cvt_f16_f32_e32 v2, v43 v_cvt_f16_f32_e32 v3, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v1, v47 v_accvgpr_read_b32 v16, a192 v_cvt_f16_f32_e32 
v2, v46 v_accvgpr_read_b32 v17, a193 v_accvgpr_read_b32 v18, a194 v_accvgpr_read_b32 v19, a195 v_accvgpr_read_b32 v20, a196 v_accvgpr_read_b32 v21, a197 v_accvgpr_read_b32 v22, a198 v_accvgpr_read_b32 v23, a199 v_accvgpr_read_b32 v24, a200 v_accvgpr_read_b32 v25, a201 v_accvgpr_read_b32 v26, a202 v_accvgpr_read_b32 v27, a203 v_accvgpr_read_b32 v28, a204 v_accvgpr_read_b32 v29, a205 v_accvgpr_read_b32 v30, a206 v_accvgpr_read_b32 v31, a207 v_cvt_f16_f32_e32 v3, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_10 ; %bb.9: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) 
buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_10: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART 
s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a176 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a177 v_accvgpr_read_b32 v2, a178 v_accvgpr_read_b32 v3, a179 v_accvgpr_read_b32 v4, a180 v_accvgpr_read_b32 v5, a181 v_accvgpr_read_b32 v6, a182 v_accvgpr_read_b32 v7, a183 v_accvgpr_read_b32 v8, a184 v_accvgpr_read_b32 v9, a185 v_accvgpr_read_b32 v10, a186 v_accvgpr_read_b32 v11, a187 v_accvgpr_read_b32 v12, a188 v_accvgpr_read_b32 v13, a189 v_accvgpr_read_b32 v14, a190 v_accvgpr_read_b32 v15, a191 v_cvt_f16_f32_e32 v19, v28 s_mul_i32 s2, s33, 63 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_12 ; %bb.11: ; 
%_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen 
buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_u32_e32 v20, s33, v49 v_lshlrev_b32_e32 v21, 1, v20 v_add_u32_e32 v49, s2, v20 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen .LBB1_12: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 
v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a160 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a161 v_accvgpr_read_b32 v18, a162 v_accvgpr_read_b32 v19, a163 v_accvgpr_read_b32 v20, a164 v_accvgpr_read_b32 v21, a165 v_accvgpr_read_b32 v22, a166 v_accvgpr_read_b32 v23, a167 v_accvgpr_read_b32 v24, a168 v_accvgpr_read_b32 v25, a169 v_accvgpr_read_b32 v26, a170 v_accvgpr_read_b32 v27, a171 v_accvgpr_read_b32 v28, a172 v_accvgpr_read_b32 v29, a173 v_accvgpr_read_b32 v30, a174 v_accvgpr_read_b32 v31, a175 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_14 ; %bb.13: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i108.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) 
buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_14: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i156.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a144 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a145 v_accvgpr_read_b32 v2, 
a146 v_accvgpr_read_b32 v3, a147 v_accvgpr_read_b32 v4, a148 v_accvgpr_read_b32 v5, a149 v_accvgpr_read_b32 v6, a150 v_accvgpr_read_b32 v7, a151 v_accvgpr_read_b32 v8, a152 v_accvgpr_read_b32 v9, a153 v_accvgpr_read_b32 v10, a154 v_accvgpr_read_b32 v11, a155 v_accvgpr_read_b32 v12, a156 v_accvgpr_read_b32 v13, a157 v_accvgpr_read_b32 v14, a158 v_accvgpr_read_b32 v15, a159 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_16 ; %bb.15: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i224.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, 
s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_lshl_u32 v20, v49, s33, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v20, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v20, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v20, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v20, s[8:11], 12 offen .LBB1_16: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i272.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt 
lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a128 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a129 v_accvgpr_read_b32 v18, a130 v_accvgpr_read_b32 v19, a131 v_accvgpr_read_b32 v20, a132 v_accvgpr_read_b32 v21, a133 v_accvgpr_read_b32 v22, a134 v_accvgpr_read_b32 v23, a135 v_accvgpr_read_b32 v24, a136 v_accvgpr_read_b32 v25, a137 v_accvgpr_read_b32 v26, a138 v_accvgpr_read_b32 v27, a139 v_accvgpr_read_b32 v28, a140 v_accvgpr_read_b32 v29, a141 v_accvgpr_read_b32 v30, a142 v_accvgpr_read_b32 v31, a143 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_18 ; %bb.17: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i340.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen 
buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_18: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i388.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 
v17, v30 v_accvgpr_read_b32 v0, a112 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a113 v_accvgpr_read_b32 v2, a114 v_accvgpr_read_b32 v3, a115 v_accvgpr_read_b32 v4, a116 v_accvgpr_read_b32 v5, a117 v_accvgpr_read_b32 v6, a118 v_accvgpr_read_b32 v7, a119 v_accvgpr_read_b32 v8, a120 v_accvgpr_read_b32 v9, a121 v_accvgpr_read_b32 v10, a122 v_accvgpr_read_b32 v11, a123 v_accvgpr_read_b32 v12, a124 v_accvgpr_read_b32 v13, a125 v_accvgpr_read_b32 v14, a126 v_accvgpr_read_b32 v15, a127 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_20 ; %bb.19: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i444.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 
v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_u32_e32 v20, s33, v49 v_lshlrev_b32_e32 v21, 1, v20 v_add_u32_e32 v49, s2, v20 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen .LBB1_20: ; 
%_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I469.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a96 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a97 v_accvgpr_read_b32 v18, a98 v_accvgpr_read_b32 v19, a99 
v_accvgpr_read_b32 v20, a100 v_accvgpr_read_b32 v21, a101 v_accvgpr_read_b32 v22, a102 v_accvgpr_read_b32 v23, a103 v_accvgpr_read_b32 v24, a104 v_accvgpr_read_b32 v25, a105 v_accvgpr_read_b32 v26, a106 v_accvgpr_read_b32 v27, a107 v_accvgpr_read_b32 v28, a108 v_accvgpr_read_b32 v29, a109 v_accvgpr_read_b32 v30, a110 v_accvgpr_read_b32 v31, a111 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_22 ; %bb.21: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i577.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 
offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_22: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i625.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 
ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a80 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a81 v_accvgpr_read_b32 v2, a82 v_accvgpr_read_b32 v3, a83 v_accvgpr_read_b32 v4, a84 v_accvgpr_read_b32 v5, a85 v_accvgpr_read_b32 v6, a86 v_accvgpr_read_b32 v7, a87 v_accvgpr_read_b32 v8, a88 v_accvgpr_read_b32 v9, a89 v_accvgpr_read_b32 v10, a90 v_accvgpr_read_b32 v11, a91 v_accvgpr_read_b32 v12, a92 v_accvgpr_read_b32 v13, a93 v_accvgpr_read_b32 v14, a94 v_accvgpr_read_b32 v15, a95 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_24 ; %bb.23: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i693.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_lshl_u32 v20, v49, s33, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v20, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, 
v20, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v20, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v20, s[8:11], 12 offen .LBB1_24: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i741.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 
v_accvgpr_read_b32 v16, a64 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a65 v_accvgpr_read_b32 v18, a66 v_accvgpr_read_b32 v19, a67 v_accvgpr_read_b32 v20, a68 v_accvgpr_read_b32 v21, a69 v_accvgpr_read_b32 v22, a70 v_accvgpr_read_b32 v23, a71 v_accvgpr_read_b32 v24, a72 v_accvgpr_read_b32 v25, a73 v_accvgpr_read_b32 v26, a74 v_accvgpr_read_b32 v27, a75 v_accvgpr_read_b32 v28, a76 v_accvgpr_read_b32 v29, a77 v_accvgpr_read_b32 v30, a78 v_accvgpr_read_b32 v31, a79 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_26 ; %bb.25: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i809.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 
v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_26: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i857.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 
v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a48 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a49 v_accvgpr_read_b32 v2, a50 v_accvgpr_read_b32 v3, a51 v_accvgpr_read_b32 v4, a52 v_accvgpr_read_b32 v5, a53 v_accvgpr_read_b32 v6, a54 v_accvgpr_read_b32 v7, a55 v_accvgpr_read_b32 v8, a56 v_accvgpr_read_b32 v9, a57 v_accvgpr_read_b32 v10, a58 v_accvgpr_read_b32 v11, a59 v_accvgpr_read_b32 v12, a60 v_accvgpr_read_b32 v13, a61 v_accvgpr_read_b32 v14, a62 v_accvgpr_read_b32 v15, a63 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_28 ; %bb.27: ; 
%_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i913.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 
offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_u32_e32 v20, s33, v49 v_lshlrev_b32_e32 v21, 1, v20 v_add_u32_e32 v49, s2, v20 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen .LBB1_28: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I938.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 
v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a32 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a33 v_accvgpr_read_b32 v18, a34 v_accvgpr_read_b32 v19, a35 v_accvgpr_read_b32 v20, a36 v_accvgpr_read_b32 v21, a37 v_accvgpr_read_b32 v22, a38 v_accvgpr_read_b32 v23, a39 v_accvgpr_read_b32 v24, a40 v_accvgpr_read_b32 v25, a41 v_accvgpr_read_b32 v26, a42 v_accvgpr_read_b32 v27, a43 v_accvgpr_read_b32 v28, a44 v_accvgpr_read_b32 v29, a45 v_accvgpr_read_b32 v30, a46 v_accvgpr_read_b32 v31, a47 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_30 ; %bb.29: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i1046.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, 
v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_30: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i1094.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a16 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a17 v_accvgpr_read_b32 v2, 
a18 v_accvgpr_read_b32 v3, a19 v_accvgpr_read_b32 v4, a20 v_accvgpr_read_b32 v5, a21 v_accvgpr_read_b32 v6, a22 v_accvgpr_read_b32 v7, a23 v_accvgpr_read_b32 v8, a24 v_accvgpr_read_b32 v9, a25 v_accvgpr_read_b32 v10, a26 v_accvgpr_read_b32 v11, a27 v_accvgpr_read_b32 v12, a28 v_accvgpr_read_b32 v13, a29 v_accvgpr_read_b32 v14, a30 v_accvgpr_read_b32 v15, a31 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_32 ; %bb.31: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i1162.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[8:11], 0 offen 
buffer_atomic_pk_add_f16 v17, v22, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[8:11], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_lshl_u32 v20, v49, s33, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v20, s[8:11], 0 offen buffer_atomic_pk_add_f16 v17, v20, s[8:11], 4 offen buffer_atomic_pk_add_f16 v18, v20, s[8:11], 8 offen buffer_atomic_pk_add_f16 v19, v20, s[8:11], 12 offen .LBB1_32: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i1210.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) 
s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v31, a15 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v30, a14 v_accvgpr_read_b32 v29, a13 v_accvgpr_read_b32 v28, a12 v_accvgpr_read_b32 v27, a11 v_accvgpr_read_b32 v26, a10 v_accvgpr_read_b32 v25, a9 v_accvgpr_read_b32 v24, a8 v_accvgpr_read_b32 v23, a7 v_accvgpr_read_b32 v22, a6 v_accvgpr_read_b32 v21, a5 v_accvgpr_read_b32 v20, a4 v_accvgpr_read_b32 v19, a3 v_accvgpr_read_b32 v18, a2 v_accvgpr_read_b32 v17, a1 v_accvgpr_read_b32 v16, a0 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_34 ; %bb.33: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i1278.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen 
buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_34: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i1326.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v16 v_cvt_f16_f32_e32 v1, v17 v_cvt_f16_f32_e32 v2, v18 v_cvt_f16_f32_e32 v3, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v23 v_cvt_f16_f32_e32 v1, v22 v_cvt_f16_f32_e32 v2, v21 v_cvt_f16_f32_e32 v3, v20 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v24 v_cvt_f16_f32_e32 v1, v25 v_cvt_f16_f32_e32 v2, v26 v_cvt_f16_f32_e32 v3, v27 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v31 v_cvt_f16_f32_e32 v1, v30 
v_cvt_f16_f32_e32 v2, v29 v_cvt_f16_f32_e32 v3, v28 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB1_36 ; %bb.35: v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_lshl_b32 s10, s41, 1 s_mov_b32 s11, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[8:11], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s33, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[8:11], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[8:11], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[8:11], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[8:11], 12 offen .LBB1_36: ; 
%_ZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_IJ s_endpgm .section .rodata,#alloc .p2align 6 .amdhsa_kernel 
_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ 
; ---------------------------------------------------------------------------
; Kernel-descriptor fields for the kernel named by the preceding
; `.amdhsa_kernel` directive; the descriptor is closed by `.end_amdhsa_kernel`
; on this same line.
; NOTE(review): the interpretations below follow the LLVM AMDGPU usage guide
; for `.amdhsa_*` directives -- confirm against the LLVM release that emitted
; this file.
;
;   Memory sizes
;     group_segment_fixed_size   34816  (34 KiB; presumably the LDS allocation
;                                        per work-group -- TODO confirm)
;     private_segment_fixed_size 0
;     kernarg_size               544
;
;   User SGPRs set to 1:   private_segment_buffer, kernarg_segment_ptr
;   User SGPRs set to 0:   dispatch_ptr, queue_ptr, dispatch_id,
;                          flat_scratch_init, private_segment_size
;
;   System SGPRs set to 1: workgroup_id_x
;   System SGPRs set to 0: workgroup_id_y, workgroup_id_z, workgroup_info,
;                          private_segment_wavefront_offset
;   system_vgpr_workitem_id 0  (presumably "X component only" -- TODO confirm)
;
;   Register budget
;     next_free_vgpr 256, next_free_sgpr 48
;     reserve_flat_scratch 0, reserve_xnack_mask 1
;
;   Floating-point mode
;     float_round_mode_32 = 0, float_round_mode_16_64 = 0
;     float_denorm_mode_32 = 3, float_denorm_mode_16_64 = 3
;     dx10_clamp = 1, ieee_mode = 1, fp16_overflow = 0
;
;   Exception enables: every .amdhsa_exception_* field is 0.
;
; The line ends with a bare `.section ` directive; its operand (the kernel's
; .text section name and flags) is on the following line of this dump.
; ---------------------------------------------------------------------------
.amdhsa_group_segment_fixed_size 34816 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 544 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 256 .amdhsa_next_free_sgpr 48 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 1 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .section 
.text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr 
.Lfunc_end1: .size _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_, 
.Lfunc_end1-_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- End 
function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 16732 ; NumSgprs: 50 ; NumVgprs: 93 ; NumAgprs: 256 ; TotalNumVgprs: 256 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 34816 bytes/workgroup (compile time only) ; SGPRBlocks: 6 ; VGPRBlocks: 63 ; NumSGPRsForWavesPerEU: 50 ; NumVGPRsForWavesPerEU: 256 ; Occupancy: 1 ; WaveLimiterHint : 0 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 .section .text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11Pa
ssThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr .protected _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wis
e11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- Begin function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11P
assThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .globl _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S2
9_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .p2align 8 .type _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256
ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,@function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi3
2ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_: ; @_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4EN
SK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x0 s_load_dwordx2 s[8:9], s[4:5], 0x8 s_load_dwordx2 s[24:25], s[4:5], 0x10 s_load_dwordx2 s[10:11], s[4:5], 0x24 s_load_dword s33, s[4:5], 0x48 s_load_dword s26, s[4:5], 0x50 s_load_dword s54, s[4:5], 0x58 s_load_dword s27, s[4:5], 0x70 s_load_dword s55, s[4:5], 0x84 s_load_dwordx4 s[12:15], s[4:5], 0x98 s_load_dwordx4 s[16:19], s[4:5], 0xac s_load_dwordx2 s[20:21], s[4:5], 0xbc s_load_dwordx2 s[2:3], s[4:5], 0xd4 s_load_dwordx2 s[22:23], s[4:5], 0xe4 s_load_dwordx2 s[34:35], s[4:5], 0x114 s_load_dwordx2 s[44:45], s[4:5], 0x120 s_load_dwordx2 s[46:47], s[4:5], 0x12c s_load_dwordx2 s[48:49], s[4:5], 0x13c s_load_dwordx2 s[50:51], s[4:5], 0x148 s_load_dwordx4 s[28:31], s[4:5], 0x1e0 s_load_dwordx4 s[36:39], s[4:5], 0x1f4 s_load_dwordx4 s[40:43], s[4:5], 0x208 s_load_dwordx2 s[52:53], s[4:5], 0x154 v_lshrrev_b32_e32 v1, 5, v0 v_lshrrev_b32_e32 v17, 7, v0 s_waitcnt lgkmcnt(0) s_mul_hi_u32 s39, s39, s6 s_add_i32 s39, s6, s39 s_lshr_b32 s39, s39, s43 s_mul_i32 s31, s39, s31 s_sub_i32 s6, s6, s31 s_mul_hi_u32 s31, s39, s38 s_add_i32 s31, s39, s31 s_lshr_b32 s31, s31, s42 s_mul_hi_u32 s37, s31, s37 s_add_i32 s37, s31, s37 s_lshr_b32 s37, s37, s41 s_mul_i32 s29, s37, s29 s_mul_i32 s30, s31, s30 s_sub_i32 s29, s31, s29 s_mul_hi_u32 s31, s37, s36 s_add_i32 s31, s37, s31 s_lshr_b32 s31, s31, s40 v_mad_i32_i24 v18, v17, -4, v1 s_mul_i32 s27, s31, s27 v_add_u32_e32 v2, s27, v18 
v_mul_hi_u32 v3, v2, s26 s_load_dword s15, s[4:5], 0x16c s_load_dword s56, s[4:5], 0x180 s_load_dword s7, s[4:5], 0x18c s_load_dword s57, s[4:5], 0x1c4 s_load_dword s58, s[4:5], 0x1d4 s_movk_i32 s27, 0xffe0 v_mad_i32_i24 v19, v1, s27, v0 v_add_u32_e32 v3, v2, v3 v_lshrrev_b32_e32 v3, s54, v3 v_mul_lo_u32 v4, v3, s33 s_waitcnt lgkmcnt(0) s_mul_i32 s29, s29, s58 s_add_i32 s29, s6, s29 s_lshl_b32 s26, s29, 8 v_lshlrev_b32_e32 v1, 3, v19 v_sub_u32_e32 v2, v2, v4 v_add_u32_e32 v4, s26, v1 v_mul_hi_u32 v5, v4, s51 s_mul_i32 s28, s31, s28 v_lshlrev_b32_e32 v37, 2, v17 s_mul_i32 s31, s31, s15 s_sub_i32 s28, s37, s28 v_lshl_or_b32 v3, v3, 3, v37 v_add_u32_e32 v6, s31, v18 s_sub_i32 s30, s39, s30 s_mul_i32 s6, s28, s57 v_mul_lo_u32 v3, v3, s10 v_mul_lo_u32 v2, v2, s11 v_add_u32_e32 v5, v4, v5 v_mul_hi_u32 v8, v6, s45 s_add_i32 s6, s6, s30 v_lshrrev_b32_e32 v5, s53, v5 s_lshl_b32 s28, s6, 8 v_mul_hi_u32 v7, v5, s50 v_add_u32_e32 v1, s28, v1 v_add3_u32 v1, v1, v3, v2 v_add_u32_e32 v3, v6, v8 v_lshrrev_b32_e32 v3, s47, v3 v_add_u32_e32 v2, v5, v7 v_mul_hi_u32 v7, v3, s44 v_lshrrev_b32_e32 v2, s52, v2 v_mul_lo_u32 v8, v2, s48 v_mul_lo_u32 v9, v3, s35 v_add_u32_e32 v7, v3, v7 v_lshrrev_b32_e32 v7, s46, v7 v_mul_lo_u32 v10, v7, s34 v_mul_lo_u32 v11, v5, s49 v_sub_u32_e32 v5, v5, v8 v_sub_u32_e32 v6, v6, v9 v_sub_u32_e32 v3, v3, v10 v_mul_lo_u32 v5, v5, s22 v_mul_lo_u32 v6, v6, s23 v_mul_lo_u32 v2, v2, s2 v_mul_lo_u32 v3, v3, s3 v_add_u32_e32 v9, s10, v1 v_add_u32_e32 v21, v6, v5 v_lshlrev_b32_e32 v5, 1, v9 v_add_u32_e32 v22, v3, v2 v_subrev_u32_e32 v2, s20, v21 v_add_u32_e32 v9, s10, v9 v_sub_u32_e32 v20, v4, v11 v_lshl_or_b32 v4, v7, 3, v37 v_subrev_u32_e32 v3, s17, v22 v_mul_lo_u32 v25, v2, s14 s_lshl_b32 s2, s55, 1 s_mov_b32 s3, 0x20000 v_lshlrev_b32_e32 v2, 1, v1 v_lshlrev_b32_e32 v26, 1, v9 v_mul_lo_u32 v23, v4, s12 v_mul_lo_u32 v24, v3, s13 buffer_load_dwordx4 v[1:4], v2, s[0:3], 0 offen s_nop 0 buffer_load_dwordx4 v[5:8], v5, s[0:3], 0 offen v_add_lshl_u32 v27, v9, s10, 1 
buffer_load_dwordx4 v[9:12], v26, s[0:3], 0 offen buffer_load_dwordx4 v[13:16], v27, s[0:3], 0 offen s_sub_i32 s0, s19, s21 v_cmp_le_i32_e32 vcc, s20, v21 v_cmp_gt_i32_e64 s[0:1], s0, v21 s_and_b64 s[14:15], vcc, s[0:1] s_sub_i32 s0, s16, s18 v_cmp_le_i32_e32 vcc, s17, v22 v_cmp_gt_i32_e64 s[0:1], s0, v22 s_and_b64 s[0:1], vcc, s[0:1] v_add_u32_e32 v20, v20, v25 v_bfrev_b32_e32 v21, -2 s_and_b64 s[0:1], s[14:15], s[0:1] v_add3_u32 v20, v20, v23, v24 v_cndmask_b32_e64 v28, v21, 0, s[0:1] s_lshl_b32 s10, s56, 1 s_mov_b32 s11, s3 v_lshl_add_u32 v29, v20, 1, v28 v_add_u32_e32 v30, s12, v20 v_lshl_add_u32 v31, v30, 1, v28 buffer_load_dwordx4 v[20:23], v29, s[8:11], 0 offen buffer_load_dwordx4 v[24:27], v31, s[8:11], 0 offen v_add_u32_e32 v29, s12, v30 v_lshl_add_u32 v38, v29, 1, v28 v_add_u32_e32 v29, s12, v29 v_lshl_add_u32 v39, v29, 1, v28 buffer_load_dwordx4 v[28:31], v38, s[8:11], 0 offen buffer_load_dwordx4 v[33:36], v39, s[8:11], 0 offen s_movk_i32 s1, 0x880 v_mul_lo_u32 v18, v18, s1 s_movk_i32 s0, 0x44 v_mul_lo_u32 v41, v19, s0 v_and_b32_e32 v19, 32, v0 v_or_b32_e32 v42, v18, v37 v_and_b32_e32 v18, 63, v0 v_sub_u32_e32 v19, v18, v19 v_lshrrev_b32_e32 v18, 4, v0 v_and_b32_e32 v43, 2, v18 v_lshlrev_b32_e32 v18, 5, v17 v_add_u32_e32 v44, v19, v18 v_ashrrev_i16_e32 v37, 15, v44 v_lshrrev_b16_e32 v37, 13, v37 v_add_u16_e32 v37, v44, v37 v_ashrrev_i16_e32 v38, 3, v37 v_and_b32_e32 v37, -8, v37 v_sub_u16_e32 v37, v44, v37 v_bfe_i32 v45, v38, 0, 16 v_bfe_i32 v46, v37, 0, 16 v_mul_u32_u24_e32 v37, s1, v43 v_mad_i32_i24 v48, v45, s0, v37 v_lshrrev_b32_e32 v37, 6, v0 v_mad_i32_i24 v37, v17, -2, v37 v_lshl_add_u32 v19, v37, 5, v19 v_ashrrev_i32_e32 v37, 31, v19 v_lshrrev_b32_e32 v37, 29, v37 v_add_u32_e32 v49, v19, v37 v_ashrrev_i32_e32 v50, 3, v49 s_waitcnt vmcnt(6) ;;#ASMSTART v_pack_b32_f16 v37, v1, v5 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v39, v1, v5, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(4) ;;#ASMSTART v_pack_b32_f16 v38, v9, v13 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 
v40, v9, v13, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v1, v2, v6 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v2, v6, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v2, v10, v14 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v10, v14, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v3, v7 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v3, v7, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v11, v15 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v11, v15, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v4, v8 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v4, v8, op_sel:[1, 1] ;;#ASMEND v_mul_lo_u32 v4, v50, s0 s_mov_b32 s8, 0 s_mov_b32 s23, s8 v_and_b32_e32 v8, -8, v49 s_mov_b32 s9, s8 s_mov_b32 s10, s8 s_mov_b32 s11, s8 s_mov_b32 s12, s8 s_mov_b32 s13, s8 s_mov_b32 s14, s8 s_mov_b32 s15, s8 s_mov_b32 s16, s8 s_mov_b32 s17, s8 s_mov_b32 s18, s8 s_mov_b32 s19, s8 s_mov_b32 s20, s8 s_mov_b32 s21, s8 s_mov_b32 s22, s8 v_mov_b32_e32 v32, s23 v_sub_u32_e32 v49, v19, v8 v_mad_u32_u24 v43, v43, s1, v4 ;;#ASMSTART v_pack_b32_f16 v4, v12, v16 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v12, v16, op_sel:[1, 1] ;;#ASMEND v_add_lshl_u32 v41, v42, v41, 1 s_movk_i32 s1, 0x4000 ds_write2_b64 v41, v[37:38], v[39:40] offset1:2 ds_write2_b64 v41, v[1:2], v[5:6] offset0:4 offset1:6 ds_write2_b64 v41, v[9:10], v[13:14] offset0:8 offset1:10 ds_write2_b64 v41, v[3:4], v[7:8] offset0:12 offset1:14 s_waitcnt vmcnt(2) ;;#ASMSTART v_pack_b32_f16 v1, v20, v24 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v20, v24, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(0) ;;#ASMSTART v_pack_b32_f16 v2, v28, v33 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v28, v33, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v21, v25 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v21, v25, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v29, v34 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v29, v34, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v22, v26 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v11, v22, v26, 
op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v30, v35 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v12, v30, v35, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v23, v27 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v15, v23, v27, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v31, v36 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v16, v31, v36, op_sel:[1, 1] ;;#ASMEND v_add_u32_e32 v20, s1, v41 ds_write2_b64 v20, v[1:2], v[3:4] offset0:128 offset1:130 ds_write2_b64 v20, v[5:6], v[7:8] offset0:132 offset1:134 ds_write2_b64 v20, v[9:10], v[11:12] offset0:136 offset1:138 ds_write2_b64 v20, v[13:14], v[15:16] offset0:140 offset1:142 v_accvgpr_write_b32 a63, v32 v_mov_b32_e32 v23, s22 v_mov_b32_e32 v16, s21 v_mov_b32_e32 v32, s20 v_accvgpr_write_b32 a62, v23 v_accvgpr_write_b32 a61, v16 v_accvgpr_write_b32 a60, v32 v_mov_b32_e32 v23, s19 v_mov_b32_e32 v16, s18 v_mov_b32_e32 v32, s17 v_lshlrev_b32_e32 v47, 3, v46 v_accvgpr_write_b32 a59, v23 v_accvgpr_write_b32 a58, v16 v_accvgpr_write_b32 a57, v32 v_mov_b32_e32 v23, s16 v_mov_b32_e32 v16, s15 v_mov_b32_e32 v32, s14 v_add_lshl_u32 v24, v48, v47, 1 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_read2_b64 v[1:4], v24 offset1:1 v_lshlrev_b32_e32 v51, 3, v49 v_accvgpr_write_b32 a56, v23 v_accvgpr_write_b32 a55, v16 v_accvgpr_write_b32 a54, v32 v_mov_b32_e32 v23, s13 v_mov_b32_e32 v16, s12 v_mov_b32_e32 v32, s11 v_add_lshl_u32 v25, v43, v51, 1 v_add_u32_e32 v5, s1, v25 ds_read2_b64 v[5:8], v5 offset0:128 offset1:129 v_accvgpr_write_b32 a53, v23 v_accvgpr_write_b32 a52, v16 v_accvgpr_write_b32 a51, v32 v_mov_b32_e32 v23, s10 v_mov_b32_e32 v16, s9 v_mov_b32_e32 v32, s8 v_accvgpr_write_b32 a50, v23 v_accvgpr_write_b32 a49, v16 v_accvgpr_write_b32 a48, v32 v_add_u32_e32 v9, 64, v19 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[48:63] s_movk_i32 s2, 0x80 s_movk_i32 s3, 0xc0 v_add_u32_e32 v13, s2, v19 v_add_u32_e32 v21, s3, v19 v_ashrrev_i32_e32 v10, 31, v9 v_ashrrev_i32_e32 v14, 31, v13 
v_ashrrev_i32_e32 v22, 31, v21 v_lshrrev_b32_e32 v10, 29, v10 v_lshrrev_b32_e32 v14, 29, v14 v_lshrrev_b32_e32 v22, 29, v22 v_add_u32_e32 v10, v9, v10 v_add_u32_e32 v14, v13, v14 v_add_u32_e32 v22, v21, v22 v_ashrrev_i32_e32 v11, 3, v10 v_ashrrev_i32_e32 v15, 3, v14 v_ashrrev_i32_e32 v23, 3, v22 v_sub_u32_e32 v11, v11, v50 v_sub_u32_e32 v15, v15, v50 v_sub_u32_e32 v23, v23, v50 s_mov_b32 s1, 0xffffff8 v_mul_lo_u32 v11, v11, s0 v_mul_lo_u32 v15, v15, s0 v_mul_lo_u32 v23, v23, s0 v_and_b32_e32 v10, s1, v10 v_and_b32_e32 v14, s1, v14 v_and_b32_e32 v22, s1, v22 v_sub_u32_e32 v9, v9, v10 v_sub_u32_e32 v13, v13, v14 v_sub_u32_e32 v21, v21, v22 v_sub_u32_e32 v9, v9, v49 v_sub_u32_e32 v13, v13, v49 v_sub_u32_e32 v21, v21, v49 v_add_u32_e32 v20, 0x4400, v25 v_lshl_add_u32 v9, v9, 3, v11 v_lshl_add_u32 v13, v13, 3, v15 v_lshl_add_u32 v21, v21, 3, v23 v_lshl_add_u32 v26, v9, 1, v20 v_lshl_add_u32 v27, v13, 1, v20 v_lshl_add_u32 v28, v21, 1, v20 ds_read2_b64 v[9:12], v26 offset1:1 ds_read2_b64 v[13:16], v27 offset1:1 ds_read2_b64 v[20:23], v28 offset1:1 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[48:63] v_cmp_gt_u32_e32 vcc, s2, v0 s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[48:63] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, 64, v44 v_lshrrev_b32_e32 v1, 3, v1 v_sub_u32_e32 v1, v1, v45 v_mul_lo_u32 v1, v1, s0 v_and_b32_e32 v2, 7, v44 v_sub_u32_e32 v2, v2, v46 v_lshlrev_b32_e32 v29, 3, v2 v_add_u32_e32 v1, v29, v1 v_lshl_add_u32 v30, v1, 1, v24 v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], v[11:12], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[22:23], a[192:207] ds_read2_b64 v[1:4], v30 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[128:143], v[1:2], v[5:6], a[48:63] v_mfma_f32_32x32x8f16 a[144:159], v[1:2], v[9:10], a[48:63] 
v_mfma_f32_32x32x8f16 a[160:175], v[1:2], v[13:14], a[48:63] v_mfma_f32_32x32x8f16 a[176:191], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, s2, v44 v_lshrrev_b32_e32 v1, 3, v1 v_sub_u32_e32 v1, v1, v45 v_mul_lo_u32 v1, v1, s0 v_add_u32_e32 v1, v29, v1 v_lshl_add_u32 v31, v1, 1, v24 v_mfma_f32_32x32x8f16 a[128:143], v[3:4], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[3:4], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[3:4], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[3:4], v[22:23], a[176:191] ds_read2_b64 v[1:4], v31 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[112:127], v[1:2], v[5:6], a[48:63] v_mfma_f32_32x32x8f16 a[96:111], v[1:2], v[9:10], a[48:63] v_mfma_f32_32x32x8f16 a[80:95], v[1:2], v[13:14], a[48:63] v_mfma_f32_32x32x8f16 a[64:79], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, s3, v44 v_lshrrev_b32_e32 v1, 3, v1 v_sub_u32_e32 v1, v1, v45 v_mul_lo_u32 v1, v1, s0 s_movk_i32 s0, 0x1000 v_add_u32_e32 v1, v29, v1 v_lshl_add_u32 v33, v1, 1, v24 v_mfma_f32_32x32x8f16 a[112:127], v[3:4], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[3:4], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[3:4], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[3:4], v[22:23], a[64:79] ds_read2_b64 v[1:4], v33 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[1:2], v[5:6], a[48:63] v_add_u32_e32 v5, 0x5000, v25 v_mfma_f32_32x32x8f16 a[16:31], v[1:2], v[9:10], a[48:63] v_add_u32_e32 v9, s0, v26 v_mfma_f32_32x32x8f16 a[32:47], v[1:2], v[13:14], a[48:63] v_add_u32_e32 v13, s0, v27 v_mfma_f32_32x32x8f16 a[48:63], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, s0, v24 v_add_u32_e32 v20, s0, v28 v_mfma_f32_32x32x8f16 a[0:15], v[3:4], v[7:8], a[0:15] ds_read2_b64 v[5:8], v5 offset0:160 offset1:161 v_mfma_f32_32x32x8f16 a[16:31], v[3:4], v[11:12], a[16:31] ds_read2_b64 v[9:12], v9 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[32:47], v[3:4], v[15:16], a[32:47] ds_read2_b64 v[13:16], v13 offset0:32 offset1:33 
v_mfma_f32_32x32x8f16 a[48:63], v[3:4], v[22:23], a[48:63] ds_read2_b64 v[1:4], v1 offset0:32 offset1:33 ds_read2_b64 v[20:23], v20 offset0:32 offset1:33 s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[20:21], a[192:207] v_add_u32_e32 v1, s0, v30 ds_read2_b64 v[24:27], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v31 ds_read2_b64 v[28:31], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v33 ds_read2_b64 v[49:52], v1 offset0:32 offset1:33 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[24:25], v[5:6], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[24:25], v[9:10], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[24:25], v[13:14], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[24:25], v[20:21], a[176:191] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[28:29], v[5:6], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[28:29], v[9:10], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[28:29], v[13:14], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[28:29], v[20:21], a[64:79] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[49:50], v[5:6], a[0:15] v_mfma_f32_32x32x8f16 a[16:31], v[49:50], v[9:10], a[16:31] v_mfma_f32_32x32x8f16 a[32:47], v[49:50], v[13:14], a[32:47] v_mfma_f32_32x32x8f16 a[48:63], v[49:50], v[20:21], a[48:63] v_mov_b32_e32 v49, 0 v_mul_i32_i24_e32 v20, s27, v17 v_mov_b32_e32 v50, 0 v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], v[11:12], a[224:239] s_nop 7 s_nop 7 s_nop 0 v_accvgpr_read_b32 v33, a240 v_accvgpr_read_b32 v34, a241 v_accvgpr_read_b32 v35, a242 v_accvgpr_read_b32 v36, a243 v_accvgpr_read_b32 v37, a244 v_accvgpr_read_b32 v38, a245 v_accvgpr_read_b32 v39, a246 v_accvgpr_read_b32 v40, a247 v_accvgpr_read_b32 v41, a248 v_accvgpr_read_b32 v42, a249 
v_accvgpr_read_b32 v43, a250 v_accvgpr_read_b32 v44, a251 v_accvgpr_read_b32 v45, a252 v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[22:23], a[192:207] v_accvgpr_read_b32 v46, a253 v_accvgpr_read_b32 v47, a254 v_accvgpr_read_b32 v48, a255 v_mfma_f32_32x32x8f16 a[144:159], v[26:27], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[176:191], v[26:27], v[22:23], a[176:191] v_mfma_f32_32x32x8f16 a[96:111], v[30:31], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[64:79], v[30:31], v[22:23], a[64:79] v_mfma_f32_32x32x8f16 a[16:31], v[51:52], v[11:12], a[16:31] v_mfma_f32_32x32x8f16 a[48:63], v[51:52], v[22:23], a[48:63] v_mfma_f32_32x32x8f16 a[128:143], v[26:27], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[112:127], v[30:31], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[0:15], v[51:52], v[7:8], a[0:15] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[80:95], v[30:31], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[160:175], v[26:27], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[32:47], v[51:52], v[15:16], a[32:47] s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_2 ; %bb.1: v_lshrrev_b32_e32 v1, 2, v0 v_mul_i32_i24_e32 v2, -4, v1 v_add_u32_e32 v1, v20, v1 v_lshlrev_b32_e32 v3, 1, v1 v_add_u32_e32 v4, s6, v17 v_lshl_add_u32 v3, v4, 8, v3 v_mul_lo_u32 v3, v3, s7 v_add_lshl_u32 v2, v2, v0, 4 v_lshlrev_b32_e32 v4, 12, v17 v_lshlrev_b32_e32 v1, 7, v1 v_add3_u32 v50, v2, v4, v1 v_add3_u32 v49, s26, v2, v3 .LBB2_2: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EEC2ERSO_RKNSA_IJiiiiEEES15_S1A_RKS3_.exit.i s_or_b64 exec, exec, s[0:1] v_lshrrev_b32_e32 v0, 3, v0 v_and_or_b32 v0, v0, 4, v18 v_lshlrev_b32_e32 v17, 5, v17 v_lshrrev_b32_e32 v18, 6, v19 v_add3_u32 v0, v0, v20, v17 v_sub_u32_e32 v0, v0, v18 v_lshlrev_b32_e32 v0, 6, v0 v_cvt_f16_f32_e32 v17, v33 v_add_lshl_u32 v51, v0, v19, 1 v_cvt_f16_f32_e32 v0, v34 v_cvt_f16_f32_e32 v18, v35 v_cvt_f16_f32_e32 v19, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v17 ds_write_b16 v51, v0 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v17, v39 v_cvt_f16_f32_e32 v18, v38 v_cvt_f16_f32_e32 v19, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v17, v42 v_cvt_f16_f32_e32 v18, v43 s_load_dword s2, s[4:5], 0x1b0 v_cvt_f16_f32_e32 v19, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v17, v47 v_accvgpr_read_b32 v1, a224 v_cvt_f16_f32_e32 v18, v46 v_accvgpr_read_b32 v2, a225 v_accvgpr_read_b32 v3, a226 v_accvgpr_read_b32 v4, a227 v_accvgpr_read_b32 v5, a228 v_accvgpr_read_b32 v6, a229 
v_accvgpr_read_b32 v7, a230 v_accvgpr_read_b32 v8, a231 v_accvgpr_read_b32 v9, a232 v_accvgpr_read_b32 v10, a233 v_accvgpr_read_b32 v11, a234 v_accvgpr_read_b32 v12, a235 v_accvgpr_read_b32 v13, a236 v_accvgpr_read_b32 v14, a237 v_accvgpr_read_b32 v15, a238 v_accvgpr_read_b32 v16, a239 v_cvt_f16_f32_e32 v19, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_4 ; %bb.3: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i.i.i.i v_lshlrev_b32_e32 v0, 1, v50 ds_read_b128 v[17:20], v0 ds_read_b128 v[21:24], v0 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 buffer_store_dwordx4 v[17:20], v25, s[24:27], 0 offen ds_read_b128 v[17:20], v0 offset:144 ds_read_b128 v[25:28], v0 offset:128 v_add_u32_e32 v29, 8, v49 v_lshlrev_b32_e32 v30, 1, v29 v_add_lshl_u32 v0, v29, s7, 1 buffer_store_dwordx4 v[21:24], v30, s[24:27], 0 offen s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[17:20], v0, s[24:27], 0 offen v_add_lshl_u32 v0, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[25:28], v0, s[24:27], 0 offen 
.LBB2_4: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v1 v_cvt_f16_f32_e32 v1, v2 v_cvt_f16_f32_e32 v2, v3 v_cvt_f16_f32_e32 v3, v4 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v7 v_cvt_f16_f32_e32 v2, v6 v_cvt_f16_f32_e32 v3, v5 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v9 v_cvt_f16_f32_e32 v1, v10 v_cvt_f16_f32_e32 v2, v11 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v16 v_cvt_f16_f32_e32 v1, v15 v_accvgpr_read_b32 v33, a208 v_cvt_f16_f32_e32 v2, v14 v_accvgpr_read_b32 v34, a209 v_accvgpr_read_b32 v35, a210 v_accvgpr_read_b32 
v36, a211 v_accvgpr_read_b32 v37, a212 v_accvgpr_read_b32 v38, a213 v_accvgpr_read_b32 v39, a214 v_accvgpr_read_b32 v40, a215 v_accvgpr_read_b32 v41, a216 v_accvgpr_read_b32 v42, a217 v_accvgpr_read_b32 v43, a218 v_accvgpr_read_b32 v44, a219 v_accvgpr_read_b32 v45, a220 v_accvgpr_read_b32 v46, a221 v_accvgpr_read_b32 v47, a222 v_accvgpr_read_b32 v48, a223 v_cvt_f16_f32_e32 v3, v13 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_6 ; %bb.5: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt 
lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_6: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v33 v_cvt_f16_f32_e32 v1, v34 v_cvt_f16_f32_e32 v2, v35 v_cvt_f16_f32_e32 v3, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v1, v39 v_cvt_f16_f32_e32 v2, v38 v_cvt_f16_f32_e32 v3, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v1, v42 v_cvt_f16_f32_e32 v2, v43 v_cvt_f16_f32_e32 v3, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v1, v47 v_accvgpr_read_b32 v16, a192 v_cvt_f16_f32_e32 v2, v46 
v_accvgpr_read_b32 v17, a193 v_accvgpr_read_b32 v18, a194 v_accvgpr_read_b32 v19, a195 v_accvgpr_read_b32 v20, a196 v_accvgpr_read_b32 v21, a197 v_accvgpr_read_b32 v22, a198 v_accvgpr_read_b32 v23, a199 v_accvgpr_read_b32 v24, a200 v_accvgpr_read_b32 v25, a201 v_accvgpr_read_b32 v26, a202 v_accvgpr_read_b32 v27, a203 v_accvgpr_read_b32 v28, a204 v_accvgpr_read_b32 v29, a205 v_accvgpr_read_b32 v30, a206 v_accvgpr_read_b32 v31, a207 v_cvt_f16_f32_e32 v3, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_8 ; %bb.7: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 
offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_8: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 
v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a176 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a177 v_accvgpr_read_b32 v2, a178 v_accvgpr_read_b32 v3, a179 v_accvgpr_read_b32 v4, a180 v_accvgpr_read_b32 v5, a181 v_accvgpr_read_b32 v6, a182 v_accvgpr_read_b32 v7, a183 v_accvgpr_read_b32 v8, a184 v_accvgpr_read_b32 v9, a185 v_accvgpr_read_b32 v10, a186 v_accvgpr_read_b32 v11, a187 v_accvgpr_read_b32 v12, a188 v_accvgpr_read_b32 v13, a189 v_accvgpr_read_b32 v14, a190 v_accvgpr_read_b32 v15, a191 v_cvt_f16_f32_e32 v19, v28 s_mul_i32 s3, s7, 63 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_10 ; %bb.9: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 
s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 buffer_store_dwordx4 v[16:19], v25, s[24:27], 0 offen v_add_u32_e32 v25, 8, v49 ds_read_b128 v[16:19], v24 offset:144 v_lshlrev_b32_e32 v26, 1, v25 buffer_store_dwordx4 v[20:23], v26, s[24:27], 0 offen ds_read_b128 v[20:23], v24 offset:128 v_add_lshl_u32 v24, v25, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v24, s[24:27], 0 offen s_nop 0 v_add_u32_e32 v16, s7, v49 v_lshlrev_b32_e32 v17, 1, v16 v_add_u32_e32 v49, s3, v16 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[20:23], v17, s[24:27], 0 offen .LBB2_10: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 
v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a160 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a161 v_accvgpr_read_b32 v18, a162 v_accvgpr_read_b32 v19, a163 v_accvgpr_read_b32 v20, a164 v_accvgpr_read_b32 v21, a165 v_accvgpr_read_b32 v22, a166 v_accvgpr_read_b32 v23, a167 v_accvgpr_read_b32 v24, a168 v_accvgpr_read_b32 v25, a169 v_accvgpr_read_b32 v26, a170 v_accvgpr_read_b32 v27, a171 v_accvgpr_read_b32 v28, a172 v_accvgpr_read_b32 v29, a173 v_accvgpr_read_b32 v30, a174 v_accvgpr_read_b32 v31, a175 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_12 ; %bb.11: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i92.i.i.i v_lshlrev_b32_e32 
v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_12: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i140.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 
offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a144 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a145 v_accvgpr_read_b32 v2, a146 v_accvgpr_read_b32 v3, a147 v_accvgpr_read_b32 v4, a148 v_accvgpr_read_b32 v5, a149 v_accvgpr_read_b32 v6, a150 v_accvgpr_read_b32 v7, a151 v_accvgpr_read_b32 v8, a152 v_accvgpr_read_b32 v9, a153 v_accvgpr_read_b32 v10, a154 v_accvgpr_read_b32 v11, a155 v_accvgpr_read_b32 v12, a156 v_accvgpr_read_b32 v13, a157 v_accvgpr_read_b32 v14, a158 v_accvgpr_read_b32 v15, a159 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_14 ; %bb.13: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i192.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 buffer_store_dwordx4 v[16:19], v25, s[24:27], 0 offen ds_read_b128 v[16:19], v24 offset:144 ds_read_b128 v[24:27], v24 offset:128 v_add_u32_e32 v28, 8, v49 v_lshlrev_b32_e32 v29, 1, v28 buffer_store_dwordx4 v[20:23], v29, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v20, v28, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v20, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v16, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[24:27], v16, s[24:27], 0 offen .LBB2_14: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i240.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a128 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a129 v_accvgpr_read_b32 v18, a130 v_accvgpr_read_b32 v19, 
a131 v_accvgpr_read_b32 v20, a132 v_accvgpr_read_b32 v21, a133 v_accvgpr_read_b32 v22, a134 v_accvgpr_read_b32 v23, a135 v_accvgpr_read_b32 v24, a136 v_accvgpr_read_b32 v25, a137 v_accvgpr_read_b32 v26, a138 v_accvgpr_read_b32 v27, a139 v_accvgpr_read_b32 v28, a140 v_accvgpr_read_b32 v29, a141 v_accvgpr_read_b32 v30, a142 v_accvgpr_read_b32 v31, a143 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_16 ; %bb.15: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i292.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 
s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_16: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i340.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 
v0, a112 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a113 v_accvgpr_read_b32 v2, a114 v_accvgpr_read_b32 v3, a115 v_accvgpr_read_b32 v4, a116 v_accvgpr_read_b32 v5, a117 v_accvgpr_read_b32 v6, a118 v_accvgpr_read_b32 v7, a119 v_accvgpr_read_b32 v8, a120 v_accvgpr_read_b32 v9, a121 v_accvgpr_read_b32 v10, a122 v_accvgpr_read_b32 v11, a123 v_accvgpr_read_b32 v12, a124 v_accvgpr_read_b32 v13, a125 v_accvgpr_read_b32 v14, a126 v_accvgpr_read_b32 v15, a127 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_18 ; %bb.17: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i380.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 buffer_store_dwordx4 
v[16:19], v25, s[24:27], 0 offen v_add_u32_e32 v25, 8, v49 ds_read_b128 v[16:19], v24 offset:144 v_lshlrev_b32_e32 v26, 1, v25 buffer_store_dwordx4 v[20:23], v26, s[24:27], 0 offen ds_read_b128 v[20:23], v24 offset:128 v_add_lshl_u32 v24, v25, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v24, s[24:27], 0 offen s_nop 0 v_add_u32_e32 v16, s7, v49 v_lshlrev_b32_e32 v17, 1, v16 v_add_u32_e32 v49, s3, v16 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[20:23], v17, s[24:27], 0 offen .LBB2_18: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I405.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 
offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a96 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a97 v_accvgpr_read_b32 v18, a98 v_accvgpr_read_b32 v19, a99 v_accvgpr_read_b32 v20, a100 v_accvgpr_read_b32 v21, a101 v_accvgpr_read_b32 v22, a102 v_accvgpr_read_b32 v23, a103 v_accvgpr_read_b32 v24, a104 v_accvgpr_read_b32 v25, a105 v_accvgpr_read_b32 v26, a106 v_accvgpr_read_b32 v27, a107 v_accvgpr_read_b32 v28, a108 v_accvgpr_read_b32 v29, a109 v_accvgpr_read_b32 v30, a110 v_accvgpr_read_b32 v31, a111 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_20 ; %bb.19: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i497.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt 
lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_20: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i545.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 
v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a80 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a81 v_accvgpr_read_b32 v2, a82 v_accvgpr_read_b32 v3, a83 v_accvgpr_read_b32 v4, a84 v_accvgpr_read_b32 v5, a85 v_accvgpr_read_b32 v6, a86 v_accvgpr_read_b32 v7, a87 v_accvgpr_read_b32 v8, a88 v_accvgpr_read_b32 v9, a89 v_accvgpr_read_b32 v10, a90 v_accvgpr_read_b32 v11, a91 v_accvgpr_read_b32 v12, a92 v_accvgpr_read_b32 v13, a93 v_accvgpr_read_b32 v14, a94 v_accvgpr_read_b32 v15, a95 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_22 ; %bb.21: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i597.i.i.i 
v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 buffer_store_dwordx4 v[16:19], v25, s[24:27], 0 offen ds_read_b128 v[16:19], v24 offset:144 ds_read_b128 v[24:27], v24 offset:128 v_add_u32_e32 v28, 8, v49 v_lshlrev_b32_e32 v29, 1, v28 buffer_store_dwordx4 v[20:23], v29, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v20, v28, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v20, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v16, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[24:27], v16, s[24:27], 0 offen .LBB2_22: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i645.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 
offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a64 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a65 v_accvgpr_read_b32 v18, a66 v_accvgpr_read_b32 v19, a67 v_accvgpr_read_b32 v20, a68 v_accvgpr_read_b32 v21, a69 v_accvgpr_read_b32 v22, a70 v_accvgpr_read_b32 v23, a71 v_accvgpr_read_b32 v24, a72 v_accvgpr_read_b32 v25, a73 v_accvgpr_read_b32 v26, a74 v_accvgpr_read_b32 v27, a75 v_accvgpr_read_b32 v28, a76 v_accvgpr_read_b32 v29, a77 v_accvgpr_read_b32 v30, a78 v_accvgpr_read_b32 v31, a79 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_24 ; %bb.23: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i697.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_24: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i745.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a48 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a49 v_accvgpr_read_b32 v2, 
a50 v_accvgpr_read_b32 v3, a51 v_accvgpr_read_b32 v4, a52 v_accvgpr_read_b32 v5, a53 v_accvgpr_read_b32 v6, a54 v_accvgpr_read_b32 v7, a55 v_accvgpr_read_b32 v8, a56 v_accvgpr_read_b32 v9, a57 v_accvgpr_read_b32 v10, a58 v_accvgpr_read_b32 v11, a59 v_accvgpr_read_b32 v12, a60 v_accvgpr_read_b32 v13, a61 v_accvgpr_read_b32 v14, a62 v_accvgpr_read_b32 v15, a63 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_26 ; %bb.25: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i785.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 buffer_store_dwordx4 v[16:19], v25, s[24:27], 0 offen v_add_u32_e32 v25, 8, v49 ds_read_b128 v[16:19], v24 offset:144 
v_lshlrev_b32_e32 v26, 1, v25 buffer_store_dwordx4 v[20:23], v26, s[24:27], 0 offen ds_read_b128 v[20:23], v24 offset:128 v_add_lshl_u32 v24, v25, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v24, s[24:27], 0 offen s_nop 0 v_add_u32_e32 v16, s7, v49 v_lshlrev_b32_e32 v17, 1, v16 v_add_u32_e32 v49, s3, v16 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[20:23], v17, s[24:27], 0 offen .LBB2_26: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I810.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 
v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a32 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a33 v_accvgpr_read_b32 v18, a34 v_accvgpr_read_b32 v19, a35 v_accvgpr_read_b32 v20, a36 v_accvgpr_read_b32 v21, a37 v_accvgpr_read_b32 v22, a38 v_accvgpr_read_b32 v23, a39 v_accvgpr_read_b32 v24, a40 v_accvgpr_read_b32 v25, a41 v_accvgpr_read_b32 v26, a42 v_accvgpr_read_b32 v27, a43 v_accvgpr_read_b32 v28, a44 v_accvgpr_read_b32 v29, a45 v_accvgpr_read_b32 v30, a46 v_accvgpr_read_b32 v31, a47 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_28 ; %bb.27: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i902.i.i.i v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, 
s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_28: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i950.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 
v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a16 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a17 v_accvgpr_read_b32 v2, a18 v_accvgpr_read_b32 v3, a19 v_accvgpr_read_b32 v4, a20 v_accvgpr_read_b32 v5, a21 v_accvgpr_read_b32 v6, a22 v_accvgpr_read_b32 v7, a23 v_accvgpr_read_b32 v8, a24 v_accvgpr_read_b32 v9, a25 v_accvgpr_read_b32 v10, a26 v_accvgpr_read_b32 v11, a27 v_accvgpr_read_b32 v12, a28 v_accvgpr_read_b32 v13, a29 v_accvgpr_read_b32 v14, a30 v_accvgpr_read_b32 v15, a31 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_30 ; %bb.29: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i81.i.i.i1002.i.i.i v_lshlrev_b32_e32 v24, 1, v50 ds_read_b128 v[16:19], v24 ds_read_b128 v[20:23], v24 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 
s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v25, 1, v49 buffer_store_dwordx4 v[16:19], v25, s[24:27], 0 offen ds_read_b128 v[16:19], v24 offset:144 ds_read_b128 v[24:27], v24 offset:128 v_add_u32_e32 v28, 8, v49 v_lshlrev_b32_e32 v29, 1, v28 buffer_store_dwordx4 v[20:23], v29, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v20, v28, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[16:19], v20, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v16, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[24:27], v16, s[24:27], 0 offen .LBB2_30: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_106.i.i.i1050.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 
v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v31, a15 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v30, a14 v_accvgpr_read_b32 v29, a13 v_accvgpr_read_b32 v28, a12 v_accvgpr_read_b32 v27, a11 v_accvgpr_read_b32 v26, a10 v_accvgpr_read_b32 v25, a9 v_accvgpr_read_b32 v24, a8 v_accvgpr_read_b32 v23, a7 v_accvgpr_read_b32 v22, a6 v_accvgpr_read_b32 v21, a5 v_accvgpr_read_b32 v20, a4 v_accvgpr_read_b32 v19, a3 v_accvgpr_read_b32 v18, a2 v_accvgpr_read_b32 v17, a1 v_accvgpr_read_b32 v16, a0 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_32 ; %bb.31: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE0ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i187.i.i.i1102.i.i.i v_lshlrev_b32_e32 v8, 1, v50 
ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_32: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_212.i.i.i1150.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v16 v_cvt_f16_f32_e32 v1, v17 v_cvt_f16_f32_e32 v2, v18 v_cvt_f16_f32_e32 v3, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 
v_cvt_f16_f32_e32 v0, v23 v_cvt_f16_f32_e32 v1, v22 v_cvt_f16_f32_e32 v2, v21 v_cvt_f16_f32_e32 v3, v20 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v24 v_cvt_f16_f32_e32 v1, v25 v_cvt_f16_f32_e32 v2, v26 v_cvt_f16_f32_e32 v3, v27 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v31 v_cvt_f16_f32_e32 v1, v30 v_cvt_f16_f32_e32 v2, v29 v_cvt_f16_f32_e32 v3, v28 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB2_34 ; %bb.33: v_lshlrev_b32_e32 v8, 1, v50 ds_read_b128 v[0:3], v8 ds_read_b128 v[4:7], v8 offset:16 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v9, 1, v49 buffer_store_dwordx4 v[0:3], v9, s[24:27], 0 offen ds_read_b128 v[0:3], v8 offset:144 ds_read_b128 v[8:11], v8 offset:128 v_add_u32_e32 v12, 8, v49 v_lshlrev_b32_e32 v13, 1, v12 buffer_store_dwordx4 v[4:7], v13, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v4, v12, s7, 1 s_waitcnt lgkmcnt(1) buffer_store_dwordx4 v[0:3], v4, s[24:27], 0 offen s_nop 0 v_add_lshl_u32 v0, v49, s7, 1 s_waitcnt lgkmcnt(0) buffer_store_dwordx4 v[8:11], v0, s[24:27], 0 offen .LBB2_34: ; 
%_ZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_IJ s_endpgm .section .rodata,#alloc .p2align 6 .amdhsa_kernel 
_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ 
; Kernel descriptor fields for the kernel named by the preceding .amdhsa_kernel
; directive. Each directive sets one field of the HSA kernel descriptor.
; Several values can be cross-checked against the compiler's "; Kernel info:"
; summary emitted after the function (LDSByteSize 34816, TotalNumVgprs 256,
; FloatMode 240, IeeeMode 1, USER_SGPR 6, TGID_X_EN 1, TIDIG_COMP_CNT 0).

; --- memory footprint ---
.amdhsa_group_segment_fixed_size 34816 ; LDS bytes per workgroup (summary: LDSByteSize 34816)
.amdhsa_private_segment_fixed_size 0 ; no scratch (summary: ScratchSize 0)
.amdhsa_kernarg_size 544 ; size in bytes of the kernel-argument segment

; --- user SGPRs preloaded at wave launch ---
; Two groups are enabled: the private segment buffer and the kernarg segment
; pointer. The summary reports USER_SGPR: 6 for this kernel.
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_flat_scratch_init 0
.amdhsa_user_sgpr_private_segment_size 0

; --- system SGPRs / VGPRs ---
; Only the X workgroup id is requested (summary: TGID_X_EN 1, Y/Z 0).
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 0
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0 ; 0 selects the X work-item id only (summary: TIDIG_COMP_CNT 0)

; --- register budget ---
.amdhsa_next_free_vgpr 256 ; matches TotalNumVgprs 256 in the summary (Occupancy 1)
.amdhsa_next_free_sgpr 59
.amdhsa_reserve_flat_scratch 0
.amdhsa_reserve_xnack_mask 1

; --- floating-point mode ---
; Round mode 0 for both precisions, denorm mode 3 for both precisions.
; Packed as (3 << 6) | (3 << 4) | (0 << 2) | 0 = 240, the FloatMode in the summary.
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1 ; summary: IeeeMode 1
.amdhsa_fp16_overflow 0

; --- exception enables (all disabled) ---
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
; The operand of the following .section directive (the kernel's .text section
; name) continues on the next line of the file, so nothing may follow it here.
.section
.text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr 
.Lfunc_end2: .size _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_, 
.Lfunc_end2-_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- End 
function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 11120 ; NumSgprs: 61 ; NumVgprs: 53 ; NumAgprs: 256 ; TotalNumVgprs: 256 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 34816 bytes/workgroup (compile time only) ; SGPRBlocks: 7 ; VGPRBlocks: 63 ; NumSGPRsForWavesPerEU: 61 ; NumVGPRsForWavesPerEU: 256 ; Occupancy: 1 ; WaveLimiterHint : 0 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 .section .text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11Pa
ssThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr .protected _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wis
e11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- Begin function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11P
assThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .globl _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S2
9_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .p2align 8 .type _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256
ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,@function _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi3
2ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_: ; @_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4EN
SK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; %bb.0: s_load_dwordx2 s[0:1], s[4:5], 0x0 s_load_dwordx2 s[8:9], s[4:5], 0x8 s_load_dwordx2 s[24:25], s[4:5], 0x10 s_load_dwordx2 s[10:11], s[4:5], 0x24 s_load_dword s33, s[4:5], 0x48 s_load_dword s26, s[4:5], 0x50 s_load_dword s54, s[4:5], 0x58 s_load_dword s27, s[4:5], 0x70 s_load_dword s55, s[4:5], 0x84 s_load_dwordx4 s[12:15], s[4:5], 0x98 s_load_dwordx4 s[16:19], s[4:5], 0xac s_load_dwordx2 s[20:21], s[4:5], 0xbc s_load_dwordx2 s[2:3], s[4:5], 0xd4 s_load_dwordx2 s[22:23], s[4:5], 0xe4 s_load_dwordx2 s[34:35], s[4:5], 0x114 s_load_dwordx2 s[44:45], s[4:5], 0x120 s_load_dwordx2 s[46:47], s[4:5], 0x12c s_load_dwordx2 s[48:49], s[4:5], 0x13c s_load_dwordx2 s[50:51], s[4:5], 0x148 s_load_dwordx4 s[28:31], s[4:5], 0x1e0 s_load_dwordx4 s[36:39], s[4:5], 0x1f4 s_load_dwordx4 s[40:43], s[4:5], 0x208 s_load_dwordx2 s[52:53], s[4:5], 0x154 v_lshrrev_b32_e32 v1, 5, v0 v_lshrrev_b32_e32 v17, 7, v0 s_waitcnt lgkmcnt(0) s_mul_hi_u32 s39, s39, s6 s_add_i32 s39, s6, s39 s_lshr_b32 s39, s39, s43 s_mul_i32 s31, s39, s31 s_sub_i32 s6, s6, s31 s_mul_hi_u32 s31, s39, s38 s_add_i32 s31, s39, s31 s_lshr_b32 s31, s31, s42 s_mul_hi_u32 s37, s31, s37 s_add_i32 s37, s31, s37 s_lshr_b32 s37, s37, s41 s_mul_i32 s29, s37, s29 s_mul_i32 s30, s31, s30 s_sub_i32 s29, s31, s29 s_mul_hi_u32 s31, s37, s36 s_add_i32 s31, s37, s31 s_lshr_b32 s31, s31, s40 v_mad_i32_i24 v18, v17, -4, v1 s_mul_i32 s27, s31, s27 v_add_u32_e32 v2, s27, v18 
v_mul_hi_u32 v3, v2, s26 s_load_dword s15, s[4:5], 0x16c s_load_dword s56, s[4:5], 0x180 s_load_dword s7, s[4:5], 0x18c s_load_dword s57, s[4:5], 0x1c4 s_load_dword s58, s[4:5], 0x1d4 s_movk_i32 s27, 0xffe0 v_mad_i32_i24 v19, v1, s27, v0 v_add_u32_e32 v3, v2, v3 v_lshrrev_b32_e32 v3, s54, v3 v_mul_lo_u32 v4, v3, s33 s_waitcnt lgkmcnt(0) s_mul_i32 s29, s29, s58 s_add_i32 s29, s6, s29 s_lshl_b32 s26, s29, 8 v_lshlrev_b32_e32 v1, 3, v19 v_sub_u32_e32 v2, v2, v4 v_add_u32_e32 v4, s26, v1 v_mul_hi_u32 v5, v4, s51 s_mul_i32 s28, s31, s28 v_lshlrev_b32_e32 v37, 2, v17 s_mul_i32 s31, s31, s15 s_sub_i32 s28, s37, s28 v_lshl_or_b32 v3, v3, 3, v37 v_add_u32_e32 v6, s31, v18 s_sub_i32 s30, s39, s30 s_mul_i32 s6, s28, s57 v_mul_lo_u32 v3, v3, s10 v_mul_lo_u32 v2, v2, s11 v_add_u32_e32 v5, v4, v5 v_mul_hi_u32 v8, v6, s45 s_add_i32 s6, s6, s30 v_lshrrev_b32_e32 v5, s53, v5 s_lshl_b32 s28, s6, 8 v_mul_hi_u32 v7, v5, s50 v_add_u32_e32 v1, s28, v1 v_add3_u32 v1, v1, v3, v2 v_add_u32_e32 v3, v6, v8 v_lshrrev_b32_e32 v3, s47, v3 v_add_u32_e32 v2, v5, v7 v_mul_hi_u32 v7, v3, s44 v_lshrrev_b32_e32 v2, s52, v2 v_mul_lo_u32 v8, v2, s48 v_mul_lo_u32 v9, v3, s35 v_add_u32_e32 v7, v3, v7 v_lshrrev_b32_e32 v7, s46, v7 v_mul_lo_u32 v10, v7, s34 v_mul_lo_u32 v11, v5, s49 v_sub_u32_e32 v5, v5, v8 v_sub_u32_e32 v6, v6, v9 v_sub_u32_e32 v3, v3, v10 v_mul_lo_u32 v5, v5, s22 v_mul_lo_u32 v6, v6, s23 v_mul_lo_u32 v2, v2, s2 v_mul_lo_u32 v3, v3, s3 v_add_u32_e32 v9, s10, v1 v_add_u32_e32 v21, v6, v5 v_lshlrev_b32_e32 v5, 1, v9 v_add_u32_e32 v22, v3, v2 v_subrev_u32_e32 v2, s20, v21 v_add_u32_e32 v9, s10, v9 v_sub_u32_e32 v20, v4, v11 v_lshl_or_b32 v4, v7, 3, v37 v_subrev_u32_e32 v3, s17, v22 v_mul_lo_u32 v25, v2, s14 s_lshl_b32 s2, s55, 1 s_mov_b32 s3, 0x20000 v_lshlrev_b32_e32 v2, 1, v1 v_lshlrev_b32_e32 v26, 1, v9 v_mul_lo_u32 v23, v4, s12 v_mul_lo_u32 v24, v3, s13 buffer_load_dwordx4 v[1:4], v2, s[0:3], 0 offen s_nop 0 buffer_load_dwordx4 v[5:8], v5, s[0:3], 0 offen v_add_lshl_u32 v27, v9, s10, 1 
buffer_load_dwordx4 v[9:12], v26, s[0:3], 0 offen buffer_load_dwordx4 v[13:16], v27, s[0:3], 0 offen s_sub_i32 s0, s19, s21 v_cmp_le_i32_e32 vcc, s20, v21 v_cmp_gt_i32_e64 s[0:1], s0, v21 s_and_b64 s[14:15], vcc, s[0:1] s_sub_i32 s0, s16, s18 v_cmp_le_i32_e32 vcc, s17, v22 v_cmp_gt_i32_e64 s[0:1], s0, v22 s_and_b64 s[0:1], vcc, s[0:1] v_add_u32_e32 v20, v20, v25 v_bfrev_b32_e32 v21, -2 s_and_b64 s[0:1], s[14:15], s[0:1] v_add3_u32 v20, v20, v23, v24 v_cndmask_b32_e64 v28, v21, 0, s[0:1] s_lshl_b32 s10, s56, 1 s_mov_b32 s11, s3 v_lshl_add_u32 v29, v20, 1, v28 v_add_u32_e32 v30, s12, v20 v_lshl_add_u32 v31, v30, 1, v28 buffer_load_dwordx4 v[20:23], v29, s[8:11], 0 offen buffer_load_dwordx4 v[24:27], v31, s[8:11], 0 offen v_add_u32_e32 v29, s12, v30 v_lshl_add_u32 v38, v29, 1, v28 v_add_u32_e32 v29, s12, v29 v_lshl_add_u32 v39, v29, 1, v28 buffer_load_dwordx4 v[28:31], v38, s[8:11], 0 offen buffer_load_dwordx4 v[33:36], v39, s[8:11], 0 offen s_movk_i32 s1, 0x880 v_mul_lo_u32 v18, v18, s1 s_movk_i32 s0, 0x44 v_mul_lo_u32 v41, v19, s0 v_and_b32_e32 v19, 32, v0 v_or_b32_e32 v42, v18, v37 v_and_b32_e32 v18, 63, v0 v_sub_u32_e32 v19, v18, v19 v_lshrrev_b32_e32 v18, 4, v0 v_and_b32_e32 v43, 2, v18 v_lshlrev_b32_e32 v18, 5, v17 v_add_u32_e32 v44, v19, v18 v_ashrrev_i16_e32 v37, 15, v44 v_lshrrev_b16_e32 v37, 13, v37 v_add_u16_e32 v37, v44, v37 v_ashrrev_i16_e32 v38, 3, v37 v_and_b32_e32 v37, -8, v37 v_sub_u16_e32 v37, v44, v37 v_bfe_i32 v45, v38, 0, 16 v_bfe_i32 v46, v37, 0, 16 v_mul_u32_u24_e32 v37, s1, v43 v_mad_i32_i24 v48, v45, s0, v37 v_lshrrev_b32_e32 v37, 6, v0 v_mad_i32_i24 v37, v17, -2, v37 v_lshl_add_u32 v19, v37, 5, v19 v_ashrrev_i32_e32 v37, 31, v19 v_lshrrev_b32_e32 v37, 29, v37 v_add_u32_e32 v49, v19, v37 v_ashrrev_i32_e32 v50, 3, v49 s_waitcnt vmcnt(6) ;;#ASMSTART v_pack_b32_f16 v37, v1, v5 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v39, v1, v5, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(4) ;;#ASMSTART v_pack_b32_f16 v38, v9, v13 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 
v40, v9, v13, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v1, v2, v6 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v2, v6, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v2, v10, v14 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v10, v14, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v3, v7 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v3, v7, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v11, v15 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v11, v15, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v4, v8 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v4, v8, op_sel:[1, 1] ;;#ASMEND v_mul_lo_u32 v4, v50, s0 s_mov_b32 s8, 0 s_mov_b32 s23, s8 v_and_b32_e32 v8, -8, v49 s_mov_b32 s9, s8 s_mov_b32 s10, s8 s_mov_b32 s11, s8 s_mov_b32 s12, s8 s_mov_b32 s13, s8 s_mov_b32 s14, s8 s_mov_b32 s15, s8 s_mov_b32 s16, s8 s_mov_b32 s17, s8 s_mov_b32 s18, s8 s_mov_b32 s19, s8 s_mov_b32 s20, s8 s_mov_b32 s21, s8 s_mov_b32 s22, s8 v_mov_b32_e32 v32, s23 v_sub_u32_e32 v49, v19, v8 v_mad_u32_u24 v43, v43, s1, v4 ;;#ASMSTART v_pack_b32_f16 v4, v12, v16 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v12, v16, op_sel:[1, 1] ;;#ASMEND v_add_lshl_u32 v41, v42, v41, 1 s_movk_i32 s1, 0x4000 ds_write2_b64 v41, v[37:38], v[39:40] offset1:2 ds_write2_b64 v41, v[1:2], v[5:6] offset0:4 offset1:6 ds_write2_b64 v41, v[9:10], v[13:14] offset0:8 offset1:10 ds_write2_b64 v41, v[3:4], v[7:8] offset0:12 offset1:14 s_waitcnt vmcnt(2) ;;#ASMSTART v_pack_b32_f16 v1, v20, v24 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v3, v20, v24, op_sel:[1, 1] ;;#ASMEND s_waitcnt vmcnt(0) ;;#ASMSTART v_pack_b32_f16 v2, v28, v33 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v4, v28, v33, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v5, v21, v25 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v7, v21, v25, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v6, v29, v34 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v8, v29, v34, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v9, v22, v26 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v11, v22, v26, 
op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v10, v30, v35 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v12, v30, v35, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v13, v23, v27 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v15, v23, v27, op_sel:[1, 1] ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v14, v31, v36 ;;#ASMEND ;;#ASMSTART v_pack_b32_f16 v16, v31, v36, op_sel:[1, 1] ;;#ASMEND v_add_u32_e32 v20, s1, v41 ds_write2_b64 v20, v[1:2], v[3:4] offset0:128 offset1:130 ds_write2_b64 v20, v[5:6], v[7:8] offset0:132 offset1:134 ds_write2_b64 v20, v[9:10], v[11:12] offset0:136 offset1:138 ds_write2_b64 v20, v[13:14], v[15:16] offset0:140 offset1:142 v_accvgpr_write_b32 a63, v32 v_mov_b32_e32 v23, s22 v_mov_b32_e32 v16, s21 v_mov_b32_e32 v32, s20 v_accvgpr_write_b32 a62, v23 v_accvgpr_write_b32 a61, v16 v_accvgpr_write_b32 a60, v32 v_mov_b32_e32 v23, s19 v_mov_b32_e32 v16, s18 v_mov_b32_e32 v32, s17 v_lshlrev_b32_e32 v47, 3, v46 v_accvgpr_write_b32 a59, v23 v_accvgpr_write_b32 a58, v16 v_accvgpr_write_b32 a57, v32 v_mov_b32_e32 v23, s16 v_mov_b32_e32 v16, s15 v_mov_b32_e32 v32, s14 v_add_lshl_u32 v24, v48, v47, 1 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_read2_b64 v[1:4], v24 offset1:1 v_lshlrev_b32_e32 v51, 3, v49 v_accvgpr_write_b32 a56, v23 v_accvgpr_write_b32 a55, v16 v_accvgpr_write_b32 a54, v32 v_mov_b32_e32 v23, s13 v_mov_b32_e32 v16, s12 v_mov_b32_e32 v32, s11 v_add_lshl_u32 v25, v43, v51, 1 v_add_u32_e32 v5, s1, v25 ds_read2_b64 v[5:8], v5 offset0:128 offset1:129 v_accvgpr_write_b32 a53, v23 v_accvgpr_write_b32 a52, v16 v_accvgpr_write_b32 a51, v32 v_mov_b32_e32 v23, s10 v_mov_b32_e32 v16, s9 v_mov_b32_e32 v32, s8 v_accvgpr_write_b32 a50, v23 v_accvgpr_write_b32 a49, v16 v_accvgpr_write_b32 a48, v32 v_add_u32_e32 v9, 64, v19 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[48:63] s_movk_i32 s2, 0x80 s_movk_i32 s3, 0xc0 v_add_u32_e32 v13, s2, v19 v_add_u32_e32 v21, s3, v19 v_ashrrev_i32_e32 v10, 31, v9 v_ashrrev_i32_e32 v14, 31, v13 
v_ashrrev_i32_e32 v22, 31, v21 v_lshrrev_b32_e32 v10, 29, v10 v_lshrrev_b32_e32 v14, 29, v14 v_lshrrev_b32_e32 v22, 29, v22 v_add_u32_e32 v10, v9, v10 v_add_u32_e32 v14, v13, v14 v_add_u32_e32 v22, v21, v22 v_ashrrev_i32_e32 v11, 3, v10 v_ashrrev_i32_e32 v15, 3, v14 v_ashrrev_i32_e32 v23, 3, v22 v_sub_u32_e32 v11, v11, v50 v_sub_u32_e32 v15, v15, v50 v_sub_u32_e32 v23, v23, v50 s_mov_b32 s1, 0xffffff8 v_mul_lo_u32 v11, v11, s0 v_mul_lo_u32 v15, v15, s0 v_mul_lo_u32 v23, v23, s0 v_and_b32_e32 v10, s1, v10 v_and_b32_e32 v14, s1, v14 v_and_b32_e32 v22, s1, v22 v_sub_u32_e32 v9, v9, v10 v_sub_u32_e32 v13, v13, v14 v_sub_u32_e32 v21, v21, v22 v_sub_u32_e32 v9, v9, v49 v_sub_u32_e32 v13, v13, v49 v_sub_u32_e32 v21, v21, v49 v_add_u32_e32 v20, 0x4400, v25 v_lshl_add_u32 v9, v9, 3, v11 v_lshl_add_u32 v13, v13, 3, v15 v_lshl_add_u32 v21, v21, 3, v23 v_lshl_add_u32 v26, v9, 1, v20 v_lshl_add_u32 v27, v13, 1, v20 v_lshl_add_u32 v28, v21, 1, v20 ds_read2_b64 v[9:12], v26 offset1:1 ds_read2_b64 v[13:16], v27 offset1:1 ds_read2_b64 v[20:23], v28 offset1:1 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[48:63] v_cmp_gt_u32_e32 vcc, s2, v0 s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[48:63] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, 64, v44 v_lshrrev_b32_e32 v1, 3, v1 v_sub_u32_e32 v1, v1, v45 v_mul_lo_u32 v1, v1, s0 v_and_b32_e32 v2, 7, v44 v_sub_u32_e32 v2, v2, v46 v_lshlrev_b32_e32 v29, 3, v2 v_add_u32_e32 v1, v29, v1 v_lshl_add_u32 v30, v1, 1, v24 v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], v[11:12], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[22:23], a[192:207] ds_read2_b64 v[1:4], v30 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[128:143], v[1:2], v[5:6], a[48:63] v_mfma_f32_32x32x8f16 a[144:159], v[1:2], v[9:10], a[48:63] 
v_mfma_f32_32x32x8f16 a[160:175], v[1:2], v[13:14], a[48:63] v_mfma_f32_32x32x8f16 a[176:191], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, s2, v44 v_lshrrev_b32_e32 v1, 3, v1 v_sub_u32_e32 v1, v1, v45 v_mul_lo_u32 v1, v1, s0 v_add_u32_e32 v1, v29, v1 v_lshl_add_u32 v31, v1, 1, v24 v_mfma_f32_32x32x8f16 a[128:143], v[3:4], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[3:4], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[3:4], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[3:4], v[22:23], a[176:191] ds_read2_b64 v[1:4], v31 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[112:127], v[1:2], v[5:6], a[48:63] v_mfma_f32_32x32x8f16 a[96:111], v[1:2], v[9:10], a[48:63] v_mfma_f32_32x32x8f16 a[80:95], v[1:2], v[13:14], a[48:63] v_mfma_f32_32x32x8f16 a[64:79], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, s3, v44 v_lshrrev_b32_e32 v1, 3, v1 v_sub_u32_e32 v1, v1, v45 v_mul_lo_u32 v1, v1, s0 s_movk_i32 s0, 0x1000 v_add_u32_e32 v1, v29, v1 v_lshl_add_u32 v33, v1, 1, v24 v_mfma_f32_32x32x8f16 a[112:127], v[3:4], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[3:4], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[3:4], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[3:4], v[22:23], a[64:79] ds_read2_b64 v[1:4], v33 offset1:1 s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[1:2], v[5:6], a[48:63] v_add_u32_e32 v5, 0x5000, v25 v_mfma_f32_32x32x8f16 a[16:31], v[1:2], v[9:10], a[48:63] v_add_u32_e32 v9, s0, v26 v_mfma_f32_32x32x8f16 a[32:47], v[1:2], v[13:14], a[48:63] v_add_u32_e32 v13, s0, v27 v_mfma_f32_32x32x8f16 a[48:63], v[1:2], v[20:21], a[48:63] v_add_u32_e32 v1, s0, v24 v_add_u32_e32 v20, s0, v28 v_mfma_f32_32x32x8f16 a[0:15], v[3:4], v[7:8], a[0:15] ds_read2_b64 v[5:8], v5 offset0:160 offset1:161 v_mfma_f32_32x32x8f16 a[16:31], v[3:4], v[11:12], a[16:31] ds_read2_b64 v[9:12], v9 offset0:32 offset1:33 v_mfma_f32_32x32x8f16 a[32:47], v[3:4], v[15:16], a[32:47] ds_read2_b64 v[13:16], v13 offset0:32 offset1:33 
v_mfma_f32_32x32x8f16 a[48:63], v[3:4], v[22:23], a[48:63] ds_read2_b64 v[1:4], v1 offset0:32 offset1:33 ds_read2_b64 v[20:23], v20 offset0:32 offset1:33 s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[240:255], v[1:2], v[5:6], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[1:2], v[9:10], a[224:239] v_mfma_f32_32x32x8f16 a[208:223], v[1:2], v[13:14], a[208:223] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[192:207], v[1:2], v[20:21], a[192:207] v_add_u32_e32 v1, s0, v30 ds_read2_b64 v[24:27], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v31 ds_read2_b64 v[28:31], v1 offset0:32 offset1:33 v_add_u32_e32 v1, s0, v33 ds_read2_b64 v[49:52], v1 offset0:32 offset1:33 s_waitcnt lgkmcnt(2) v_mfma_f32_32x32x8f16 a[128:143], v[24:25], v[5:6], a[128:143] v_mfma_f32_32x32x8f16 a[144:159], v[24:25], v[9:10], a[144:159] v_mfma_f32_32x32x8f16 a[160:175], v[24:25], v[13:14], a[160:175] v_mfma_f32_32x32x8f16 a[176:191], v[24:25], v[20:21], a[176:191] s_waitcnt lgkmcnt(1) v_mfma_f32_32x32x8f16 a[112:127], v[28:29], v[5:6], a[112:127] v_mfma_f32_32x32x8f16 a[96:111], v[28:29], v[9:10], a[96:111] v_mfma_f32_32x32x8f16 a[80:95], v[28:29], v[13:14], a[80:95] v_mfma_f32_32x32x8f16 a[64:79], v[28:29], v[20:21], a[64:79] s_waitcnt lgkmcnt(0) v_mfma_f32_32x32x8f16 a[0:15], v[49:50], v[5:6], a[0:15] v_mfma_f32_32x32x8f16 a[16:31], v[49:50], v[9:10], a[16:31] v_mfma_f32_32x32x8f16 a[32:47], v[49:50], v[13:14], a[32:47] v_mfma_f32_32x32x8f16 a[48:63], v[49:50], v[20:21], a[48:63] v_mov_b32_e32 v49, 0 v_mul_i32_i24_e32 v20, s27, v17 v_mov_b32_e32 v50, 0 v_mfma_f32_32x32x8f16 a[240:255], v[3:4], v[7:8], a[240:255] v_mfma_f32_32x32x8f16 a[224:239], v[3:4], v[11:12], a[224:239] s_nop 7 s_nop 7 s_nop 0 v_accvgpr_read_b32 v33, a240 v_accvgpr_read_b32 v34, a241 v_accvgpr_read_b32 v35, a242 v_accvgpr_read_b32 v36, a243 v_accvgpr_read_b32 v37, a244 v_accvgpr_read_b32 v38, a245 v_accvgpr_read_b32 v39, a246 v_accvgpr_read_b32 v40, a247 v_accvgpr_read_b32 v41, a248 v_accvgpr_read_b32 v42, a249 
v_accvgpr_read_b32 v43, a250 v_accvgpr_read_b32 v44, a251 v_accvgpr_read_b32 v45, a252 v_mfma_f32_32x32x8f16 a[192:207], v[3:4], v[22:23], a[192:207] v_accvgpr_read_b32 v46, a253 v_accvgpr_read_b32 v47, a254 v_accvgpr_read_b32 v48, a255 v_mfma_f32_32x32x8f16 a[144:159], v[26:27], v[11:12], a[144:159] v_mfma_f32_32x32x8f16 a[176:191], v[26:27], v[22:23], a[176:191] v_mfma_f32_32x32x8f16 a[96:111], v[30:31], v[11:12], a[96:111] v_mfma_f32_32x32x8f16 a[64:79], v[30:31], v[22:23], a[64:79] v_mfma_f32_32x32x8f16 a[16:31], v[51:52], v[11:12], a[16:31] v_mfma_f32_32x32x8f16 a[48:63], v[51:52], v[22:23], a[48:63] v_mfma_f32_32x32x8f16 a[128:143], v[26:27], v[7:8], a[128:143] v_mfma_f32_32x32x8f16 a[112:127], v[30:31], v[7:8], a[112:127] v_mfma_f32_32x32x8f16 a[0:15], v[51:52], v[7:8], a[0:15] v_mfma_f32_32x32x8f16 a[208:223], v[3:4], v[15:16], a[208:223] v_mfma_f32_32x32x8f16 a[80:95], v[30:31], v[15:16], a[80:95] v_mfma_f32_32x32x8f16 a[160:175], v[26:27], v[15:16], a[160:175] v_mfma_f32_32x32x8f16 a[32:47], v[51:52], v[15:16], a[32:47] s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_2 ; %bb.1: v_lshrrev_b32_e32 v1, 2, v0 v_mul_i32_i24_e32 v2, -4, v1 v_add_u32_e32 v1, v20, v1 v_lshlrev_b32_e32 v3, 1, v1 v_add_u32_e32 v4, s6, v17 v_lshl_add_u32 v3, v4, 8, v3 v_mul_lo_u32 v3, v3, s7 v_add_lshl_u32 v2, v2, v0, 4 v_lshlrev_b32_e32 v4, 12, v17 v_lshlrev_b32_e32 v1, 7, v1 v_add3_u32 v50, v2, v4, v1 v_add3_u32 v49, s26, v2, v3 .LBB3_2: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EEC2ERSO_RKNSA_IJiiiiEEES15_S1A_RKS3_.exit.i s_or_b64 exec, exec, s[0:1] v_lshrrev_b32_e32 v0, 3, v0 v_and_or_b32 v0, v0, 4, v18 v_lshlrev_b32_e32 v17, 5, v17 v_lshrrev_b32_e32 v18, 6, v19 v_add3_u32 v0, v0, v20, v17 v_sub_u32_e32 v0, v0, v18 v_lshlrev_b32_e32 v0, 6, v0 v_cvt_f16_f32_e32 v17, v33 v_add_lshl_u32 v51, v0, v19, 1 v_cvt_f16_f32_e32 v0, v34 v_cvt_f16_f32_e32 v18, v35 v_cvt_f16_f32_e32 v19, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v17 ds_write_b16 v51, v0 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v17, v39 v_cvt_f16_f32_e32 v18, v38 v_cvt_f16_f32_e32 v19, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v17, v42 v_cvt_f16_f32_e32 v18, v43 s_load_dword s2, s[4:5], 0x1b0 v_cvt_f16_f32_e32 v19, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v17, v47 v_accvgpr_read_b32 v1, a224 v_cvt_f16_f32_e32 v18, v46 v_accvgpr_read_b32 v2, a225 v_accvgpr_read_b32 v3, a226 v_accvgpr_read_b32 v4, a227 v_accvgpr_read_b32 v5, a228 v_accvgpr_read_b32 v6, a229 
v_accvgpr_read_b32 v7, a230 v_accvgpr_read_b32 v8, a231 v_accvgpr_read_b32 v9, a232 v_accvgpr_read_b32 v10, a233 v_accvgpr_read_b32 v11, a234 v_accvgpr_read_b32 v12, a235 v_accvgpr_read_b32 v13, a236 v_accvgpr_read_b32 v14, a237 v_accvgpr_read_b32 v15, a238 v_accvgpr_read_b32 v16, a239 v_cvt_f16_f32_e32 v19, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_4 ; %bb.3: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i.i.i.i v_lshlrev_b32_e32 v0, 1, v50 ds_read2_b64 v[17:20], v0 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 buffer_atomic_pk_add_f16 v17, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v20, v21, s[24:27], 12 offen ds_read2_b64 v[17:20], v0 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v17, v22, s[24:27], 0 offen buffer_atomic_pk_add_f16 v18, v22, s[24:27], 4 offen buffer_atomic_pk_add_f16 v19, 
v22, s[24:27], 8 offen buffer_atomic_pk_add_f16 v20, v22, s[24:27], 12 offen ds_read2_b64 v[17:20], v0 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v17, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v20, v21, s[24:27], 12 offen ds_read2_b64 v[17:20], v0 offset0:16 offset1:17 v_add_lshl_u32 v0, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v17, v0, s[24:27], 0 offen buffer_atomic_pk_add_f16 v18, v0, s[24:27], 4 offen buffer_atomic_pk_add_f16 v19, v0, s[24:27], 8 offen buffer_atomic_pk_add_f16 v20, v0, s[24:27], 12 offen .LBB3_4: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v1 v_cvt_f16_f32_e32 v1, v2 v_cvt_f16_f32_e32 v2, v3 v_cvt_f16_f32_e32 v3, v4 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 
offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v7 v_cvt_f16_f32_e32 v2, v6 v_cvt_f16_f32_e32 v3, v5 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v9 v_cvt_f16_f32_e32 v1, v10 v_cvt_f16_f32_e32 v2, v11 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v16 v_cvt_f16_f32_e32 v1, v15 v_accvgpr_read_b32 v33, a208 v_cvt_f16_f32_e32 v2, v14 v_accvgpr_read_b32 v34, a209 v_accvgpr_read_b32 v35, a210 v_accvgpr_read_b32 v36, a211 v_accvgpr_read_b32 v37, a212 v_accvgpr_read_b32 v38, a213 v_accvgpr_read_b32 v39, a214 v_accvgpr_read_b32 v40, a215 v_accvgpr_read_b32 v41, a216 v_accvgpr_read_b32 v42, a217 v_accvgpr_read_b32 v43, a218 v_accvgpr_read_b32 v44, a219 v_accvgpr_read_b32 v45, a220 v_accvgpr_read_b32 v46, a221 v_accvgpr_read_b32 v47, a222 v_accvgpr_read_b32 v48, a223 v_cvt_f16_f32_e32 v3, v13 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_6 ; %bb.5: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen 
buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_6: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v33 v_cvt_f16_f32_e32 v1, v34 v_cvt_f16_f32_e32 v2, v35 v_cvt_f16_f32_e32 v3, v36 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v40 v_cvt_f16_f32_e32 v1, v39 v_cvt_f16_f32_e32 v2, v38 v_cvt_f16_f32_e32 v3, v37 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v41 v_cvt_f16_f32_e32 v1, v42 v_cvt_f16_f32_e32 v2, v43 v_cvt_f16_f32_e32 v3, v44 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v48 v_cvt_f16_f32_e32 v1, v47 v_accvgpr_read_b32 
v16, a192 v_cvt_f16_f32_e32 v2, v46 v_accvgpr_read_b32 v17, a193 v_accvgpr_read_b32 v18, a194 v_accvgpr_read_b32 v19, a195 v_accvgpr_read_b32 v20, a196 v_accvgpr_read_b32 v21, a197 v_accvgpr_read_b32 v22, a198 v_accvgpr_read_b32 v23, a199 v_accvgpr_read_b32 v24, a200 v_accvgpr_read_b32 v25, a201 v_accvgpr_read_b32 v26, a202 v_accvgpr_read_b32 v27, a203 v_accvgpr_read_b32 v28, a204 v_accvgpr_read_b32 v29, a205 v_accvgpr_read_b32 v30, a206 v_accvgpr_read_b32 v31, a207 v_cvt_f16_f32_e32 v3, v45 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_8 ; %bb.7: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, 
s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_8: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 
v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a176 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a177 v_accvgpr_read_b32 v2, a178 v_accvgpr_read_b32 v3, a179 v_accvgpr_read_b32 v4, a180 v_accvgpr_read_b32 v5, a181 v_accvgpr_read_b32 v6, a182 v_accvgpr_read_b32 v7, a183 v_accvgpr_read_b32 v8, a184 v_accvgpr_read_b32 v9, a185 v_accvgpr_read_b32 v10, a186 v_accvgpr_read_b32 v11, a187 v_accvgpr_read_b32 v12, a188 v_accvgpr_read_b32 v13, a189 v_accvgpr_read_b32 v14, a190 v_accvgpr_read_b32 v15, a191 v_cvt_f16_f32_e32 v19, v28 s_mul_i32 s3, s7, 63 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_10 ; %bb.9: ; 
%_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 
8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_u32_e32 v20, s7, v49 v_lshlrev_b32_e32 v21, 1, v20 v_add_u32_e32 v49, s3, v20 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen .LBB3_10: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 
offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a160 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a161 v_accvgpr_read_b32 v18, a162 v_accvgpr_read_b32 v19, a163 v_accvgpr_read_b32 v20, a164 v_accvgpr_read_b32 v21, a165 v_accvgpr_read_b32 v22, a166 v_accvgpr_read_b32 v23, a167 v_accvgpr_read_b32 v24, a168 v_accvgpr_read_b32 v25, a169 v_accvgpr_read_b32 v26, a170 v_accvgpr_read_b32 v27, a171 v_accvgpr_read_b32 v28, a172 v_accvgpr_read_b32 v29, a173 v_accvgpr_read_b32 v30, a174 v_accvgpr_read_b32 v31, a175 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_12 ; %bb.11: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i108.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 
buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_12: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i156.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a144 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a145 v_accvgpr_read_b32 v2, 
a146 v_accvgpr_read_b32 v3, a147 v_accvgpr_read_b32 v4, a148 v_accvgpr_read_b32 v5, a149 v_accvgpr_read_b32 v6, a150 v_accvgpr_read_b32 v7, a151 v_accvgpr_read_b32 v8, a152 v_accvgpr_read_b32 v9, a153 v_accvgpr_read_b32 v10, a154 v_accvgpr_read_b32 v11, a155 v_accvgpr_read_b32 v12, a156 v_accvgpr_read_b32 v13, a157 v_accvgpr_read_b32 v14, a158 v_accvgpr_read_b32 v15, a159 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_14 ; %bb.13: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i224.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, 
s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_lshl_u32 v20, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v20, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v20, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v20, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v20, s[24:27], 12 offen .LBB3_14: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i272.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 
;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a128 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a129 v_accvgpr_read_b32 v18, a130 v_accvgpr_read_b32 v19, a131 v_accvgpr_read_b32 v20, a132 v_accvgpr_read_b32 v21, a133 v_accvgpr_read_b32 v22, a134 v_accvgpr_read_b32 v23, a135 v_accvgpr_read_b32 v24, a136 v_accvgpr_read_b32 v25, a137 v_accvgpr_read_b32 v26, a138 v_accvgpr_read_b32 v27, a139 v_accvgpr_read_b32 v28, a140 v_accvgpr_read_b32 v29, a141 v_accvgpr_read_b32 v30, a142 v_accvgpr_read_b32 v31, a143 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_16 ; %bb.15: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i340.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen 
buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_16: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i388.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 
v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a112 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a113 v_accvgpr_read_b32 v2, a114 v_accvgpr_read_b32 v3, a115 v_accvgpr_read_b32 v4, a116 v_accvgpr_read_b32 v5, a117 v_accvgpr_read_b32 v6, a118 v_accvgpr_read_b32 v7, a119 v_accvgpr_read_b32 v8, a120 v_accvgpr_read_b32 v9, a121 v_accvgpr_read_b32 v10, a122 v_accvgpr_read_b32 v11, a123 v_accvgpr_read_b32 v12, a124 v_accvgpr_read_b32 v13, a125 v_accvgpr_read_b32 v14, a126 v_accvgpr_read_b32 v15, a127 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_18 ; %bb.17: ; %_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i444.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 
buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_u32_e32 v20, s7, v49 v_lshlrev_b32_e32 v21, 1, v20 v_add_u32_e32 v49, s3, v20 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen .LBB3_18: ; 
%_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I469.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a96 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a97 v_accvgpr_read_b32 v18, a98 v_accvgpr_read_b32 v19, a99 
v_accvgpr_read_b32 v20, a100 v_accvgpr_read_b32 v21, a101 v_accvgpr_read_b32 v22, a102 v_accvgpr_read_b32 v23, a103 v_accvgpr_read_b32 v24, a104 v_accvgpr_read_b32 v25, a105 v_accvgpr_read_b32 v26, a106 v_accvgpr_read_b32 v27, a107 v_accvgpr_read_b32 v28, a108 v_accvgpr_read_b32 v29, a109 v_accvgpr_read_b32 v30, a110 v_accvgpr_read_b32 v31, a111 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_20 ; %bb.19: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i577.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 
4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_20: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i625.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 
offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a80 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a81 v_accvgpr_read_b32 v2, a82 v_accvgpr_read_b32 v3, a83 v_accvgpr_read_b32 v4, a84 v_accvgpr_read_b32 v5, a85 v_accvgpr_read_b32 v6, a86 v_accvgpr_read_b32 v7, a87 v_accvgpr_read_b32 v8, a88 v_accvgpr_read_b32 v9, a89 v_accvgpr_read_b32 v10, a90 v_accvgpr_read_b32 v11, a91 v_accvgpr_read_b32 v12, a92 v_accvgpr_read_b32 v13, a93 v_accvgpr_read_b32 v14, a94 v_accvgpr_read_b32 v15, a95 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_22 ; %bb.21: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i693.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_lshl_u32 v20, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v20, s[24:27], 0 offen 
buffer_atomic_pk_add_f16 v17, v20, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v20, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v20, s[24:27], 12 offen .LBB3_22: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i741.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 
v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a64 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a65 v_accvgpr_read_b32 v18, a66 v_accvgpr_read_b32 v19, a67 v_accvgpr_read_b32 v20, a68 v_accvgpr_read_b32 v21, a69 v_accvgpr_read_b32 v22, a70 v_accvgpr_read_b32 v23, a71 v_accvgpr_read_b32 v24, a72 v_accvgpr_read_b32 v25, a73 v_accvgpr_read_b32 v26, a74 v_accvgpr_read_b32 v27, a75 v_accvgpr_read_b32 v28, a76 v_accvgpr_read_b32 v29, a77 v_accvgpr_read_b32 v30, a78 v_accvgpr_read_b32 v31, a79 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_24 ; %bb.23: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i809.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 
v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_add_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_24: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i857.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 
v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a48 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a49 v_accvgpr_read_b32 v2, a50 v_accvgpr_read_b32 v3, a51 v_accvgpr_read_b32 v4, a52 v_accvgpr_read_b32 v5, a53 v_accvgpr_read_b32 v6, a54 v_accvgpr_read_b32 v7, a55 v_accvgpr_read_b32 v8, a56 v_accvgpr_read_b32 v9, a57 v_accvgpr_read_b32 v10, a58 v_accvgpr_read_b32 v11, a59 v_accvgpr_read_b32 v12, a60 v_accvgpr_read_b32 v13, a61 v_accvgpr_read_b32 v14, a62 v_accvgpr_read_b32 v15, a63 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_26 ; %bb.25: ; 
%_ZNK2ck10static_forILi0ELi4ELi1EEclIZZNS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS6_IJiiiEEELb0EEENS7_INS6_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESF_NS_23Merge_v2_magic_divisionINS6_IJiiEEEEESF_NSE_ISB_EENS7_ISH_Lb0EEESF_SJ_EEENS6_IJNS_8SequenceIJLi0EEEENSM_IJLi1EEEENSM_IJLi2EEEENSM_IJLi3EEEENSM_IJLi4ELi6EEEENSM_IJLi7EEEENSM_IJLi5EEEENSM_IJLi8EEEENSM_IJLi9EEEENSM_IJLi10EEEEEEENS6_IJNSM_IJLi1ELi2ELi3EEEENSM_IJLi4ELi5EEEENSM_IJLi6EEEESS_SU_SV_SW_NSM_IJLi11ELi12EEEENSM_IJLi13EEEENSM_IJLi14EEEEEEENSM_IJLi11ELi12ELi13ELi14EEEEiEENS5_INS6_IJNS7_INS6_IJiiiiEEELb0EEESF_NS_3PadIiiiLb0EEES1A_SF_SF_NS_5EmbedISH_SH_Lb0EEES1C_SF_SD_SF_SF_SF_SF_SF_NSG_IS8_EES1D_SJ_SK_SF_SJ_EEENS6_IJSN_SO_SP_SQ_NSM_IJLi4EEEEST_S10_SS_SU_SV_SW_NSM_IJLi11EEEENSM_IJLi12EEEES12_S13_NSM_IJLi15ELi18ELi20EEEENSM_IJLi17ELi19ELi21EEEENSM_IJLi16EEEENSM_IJLi22EEEENSM_IJLi23EEEENSM_IJLi24EEEEEEENS6_IJNSM_IJLi1ELi2ELi3ELi4EEEEST_.i913.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v22, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, 
s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_u32_e32 v20, s7, v49 v_lshlrev_b32_e32 v21, 1, v20 v_add_u32_e32 v49, s3, v20 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen .LBB3_26: ; %_ZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_I938.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 
v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v16, a32 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v17, a33 v_accvgpr_read_b32 v18, a34 v_accvgpr_read_b32 v19, a35 v_accvgpr_read_b32 v20, a36 v_accvgpr_read_b32 v21, a37 v_accvgpr_read_b32 v22, a38 v_accvgpr_read_b32 v23, a39 v_accvgpr_read_b32 v24, a40 v_accvgpr_read_b32 v25, a41 v_accvgpr_read_b32 v26, a42 v_accvgpr_read_b32 v27, a43 v_accvgpr_read_b32 v28, a44 v_accvgpr_read_b32 v29, a45 v_accvgpr_read_b32 v30, a46 v_accvgpr_read_b32 v31, a47 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_28 ; %bb.27: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i.i.i.i1046.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 
buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_28: ; 
%_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_.i.i.i1094.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v16, v16 v_cvt_f16_f32_e32 v17, v17 v_cvt_f16_f32_e32 v18, v18 v_cvt_f16_f32_e32 v19, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v16 ds_write_b16 v51, v17 offset:128 ds_write_b16 v51, v18 offset:256 ds_write_b16 v51, v19 offset:384 v_cvt_f16_f32_e32 v16, v23 v_cvt_f16_f32_e32 v17, v22 v_cvt_f16_f32_e32 v18, v21 v_cvt_f16_f32_e32 v19, v20 ds_write_b16 v51, v16 offset:1408 ds_write_b16 v51, v17 offset:1280 ds_write_b16 v51, v18 offset:1152 ds_write_b16 v51, v19 offset:1024 v_cvt_f16_f32_e32 v16, v24 v_cvt_f16_f32_e32 v17, v25 v_cvt_f16_f32_e32 v18, v26 v_cvt_f16_f32_e32 v19, v27 ds_write_b16 v51, v16 offset:2048 ds_write_b16 v51, v17 offset:2176 ds_write_b16 v51, v18 offset:2304 ds_write_b16 v51, v19 offset:2432 v_cvt_f16_f32_e32 v16, v31 v_cvt_f16_f32_e32 v17, v30 v_accvgpr_read_b32 v0, a16 v_cvt_f16_f32_e32 v18, v29 v_accvgpr_read_b32 v1, a17 v_accvgpr_read_b32 v2, 
a18 v_accvgpr_read_b32 v3, a19 v_accvgpr_read_b32 v4, a20 v_accvgpr_read_b32 v5, a21 v_accvgpr_read_b32 v6, a22 v_accvgpr_read_b32 v7, a23 v_accvgpr_read_b32 v8, a24 v_accvgpr_read_b32 v9, a25 v_accvgpr_read_b32 v10, a26 v_accvgpr_read_b32 v11, a27 v_accvgpr_read_b32 v12, a28 v_accvgpr_read_b32 v13, a29 v_accvgpr_read_b32 v14, a30 v_accvgpr_read_b32 v15, a31 v_cvt_f16_f32_e32 v19, v28 ds_write_b16 v51, v16 offset:3456 ds_write_b16 v51, v17 offset:3328 ds_write_b16 v51, v18 offset:3200 ds_write_b16 v51, v19 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_30 ; %bb.29: ; %_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i97.i.i.i1162.i.i.i v_lshlrev_b32_e32 v20, 1, v50 ds_read2_b64 v[16:19], v20 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v21, 1, v49 buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:2 offset1:3 v_add_u32_e32 v21, 8, v49 v_lshlrev_b32_e32 v22, 1, v21 v_add_lshl_u32 v21, v21, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v22, s[24:27], 0 
offen buffer_atomic_pk_add_f16 v17, v22, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v22, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v22, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v21, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v21, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v21, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v21, s[24:27], 12 offen ds_read2_b64 v[16:19], v20 offset0:16 offset1:17 v_add_lshl_u32 v20, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v16, v20, s[24:27], 0 offen buffer_atomic_pk_add_f16 v17, v20, s[24:27], 4 offen buffer_atomic_pk_add_f16 v18, v20, s[24:27], 8 offen buffer_atomic_pk_add_f16 v19, v20, s[24:27], 12 offen .LBB3_30: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_122.i.i.i1210.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v0 v_cvt_f16_f32_e32 v1, v1 v_cvt_f16_f32_e32 v2, v2 v_cvt_f16_f32_e32 v3, v3 ;;#ASMSTART s_waitcnt 
lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v7 v_cvt_f16_f32_e32 v1, v6 v_cvt_f16_f32_e32 v2, v5 v_cvt_f16_f32_e32 v3, v4 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v8 v_cvt_f16_f32_e32 v1, v9 v_cvt_f16_f32_e32 v2, v10 v_cvt_f16_f32_e32 v3, v11 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v15 v_cvt_f16_f32_e32 v1, v14 v_accvgpr_read_b32 v31, a15 v_cvt_f16_f32_e32 v2, v13 v_accvgpr_read_b32 v30, a14 v_accvgpr_read_b32 v29, a13 v_accvgpr_read_b32 v28, a12 v_accvgpr_read_b32 v27, a11 v_accvgpr_read_b32 v26, a10 v_accvgpr_read_b32 v25, a9 v_accvgpr_read_b32 v24, a8 v_accvgpr_read_b32 v23, a7 v_accvgpr_read_b32 v22, a6 v_accvgpr_read_b32 v21, a5 v_accvgpr_read_b32 v20, a4 v_accvgpr_read_b32 v19, a3 v_accvgpr_read_b32 v18, a2 v_accvgpr_read_b32 v17, a1 v_accvgpr_read_b32 v16, a0 v_cvt_f16_f32_e32 v3, v12 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_32 ; %bb.31: ; 
%_ZN2ck33BlockwiseTensorSliceTransfer_v6r1ILi256ENS_16tensor_operation12element_wise11PassThroughELNS_25InMemoryDataOperationEnumE1ENS_8SequenceIJLi1ELi64ELi1ELi64EEEENS5_IJLi1ELi32ELi1ELi4EEEENS5_IJLi0ELi1ELi2ELi3EEEEDF16_DF16_KNS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINSA_IJNS_17integral_constantIiLi1EEENSC_IiLi64EEESD_SE_EEELb0EEEEEENSA_IJNS5_IJLi0EEEEEEENSA_IJNS5_IJLi1ELi2ELi3ELi4EEEEEEESK_NSC_IiLi4096EEEEERKNS9_INSA_IJNSB_INSA_IJiiEEELb0EEENSB_INSA_IJiNSC_IiLi256EEEEEELb0EEEST_EEENSA_IJSI_NS5_IJLi1EEEENS5_IJLi2EEEEEEENSA_IJNS5_IJLi1ELi2EEEENS5_IJLi3ELi4EEEENS5_IJLi5ELi6EEEEEEENS5_IJLi3ELi4ELi5ELi6EEEEiEES8_Li3ELi8ELb1ELb0EE3RunINS_13DynamicBufferILNS_16AddressSpaceEnumE2EDF16_SM_Lb1EEENS18_ILS19_1EDF16_iLb1EEEEEvRSO_RKT_S15_RT0_.exit.i219.i.i.i1278.i.i.i v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 v_subrev_u32_e32 v49, 64, v49 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen 
buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_32: ; %_ZZZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_244.i.i.i1326.i.i.i s_or_b64 exec, exec, s[0:1] v_cvt_f16_f32_e32 v0, v16 v_cvt_f16_f32_e32 v1, v17 v_cvt_f16_f32_e32 v2, v18 v_cvt_f16_f32_e32 v3, v19 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND ds_write_b16 v51, v0 ds_write_b16 v51, v1 offset:128 ds_write_b16 v51, v2 offset:256 ds_write_b16 v51, v3 offset:384 v_cvt_f16_f32_e32 v0, v23 v_cvt_f16_f32_e32 v1, v22 v_cvt_f16_f32_e32 v2, v21 v_cvt_f16_f32_e32 v3, v20 ds_write_b16 v51, v0 offset:1408 ds_write_b16 v51, v1 offset:1280 ds_write_b16 v51, v2 offset:1152 ds_write_b16 v51, v3 offset:1024 v_cvt_f16_f32_e32 v0, v24 v_cvt_f16_f32_e32 v1, v25 v_cvt_f16_f32_e32 v2, v26 v_cvt_f16_f32_e32 v3, v27 ds_write_b16 v51, v0 offset:2048 ds_write_b16 v51, v1 offset:2176 ds_write_b16 v51, v2 offset:2304 ds_write_b16 v51, v3 offset:2432 v_cvt_f16_f32_e32 v0, v31 v_cvt_f16_f32_e32 v1, v30 
v_cvt_f16_f32_e32 v2, v29 v_cvt_f16_f32_e32 v3, v28 ds_write_b16 v51, v0 offset:3456 ds_write_b16 v51, v1 offset:3328 ds_write_b16 v51, v2 offset:3200 ds_write_b16 v51, v3 offset:3072 ;;#ASMSTART s_waitcnt lgkmcnt(0) s_barrier ;;#ASMEND s_and_saveexec_b64 s[0:1], vcc s_cbranch_execz .LBB3_34 ; %bb.33: v_lshlrev_b32_e32 v4, 1, v50 ds_read2_b64 v[0:3], v4 offset1:1 s_waitcnt lgkmcnt(0) s_lshl_b32 s26, s2, 1 s_mov_b32 s27, 0x20000 v_lshlrev_b32_e32 v5, 1, v49 buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 v_add_u32_e32 v5, 8, v49 v_lshlrev_b32_e32 v6, 1, v5 v_add_lshl_u32 v5, v5, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v6, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v6, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v6, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v6, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:18 offset1:19 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v5, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v5, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v5, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v5, s[24:27], 12 offen ds_read2_b64 v[0:3], v4 offset0:16 offset1:17 v_add_lshl_u32 v4, v49, s7, 1 s_waitcnt lgkmcnt(0) buffer_atomic_pk_add_f16 v0, v4, s[24:27], 0 offen buffer_atomic_pk_add_f16 v1, v4, s[24:27], 4 offen buffer_atomic_pk_add_f16 v2, v4, s[24:27], 8 offen buffer_atomic_pk_add_f16 v3, v4, s[24:27], 12 offen .LBB3_34: ; 
%_ZN2ck43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS3_IJiiiEEELb0EEENS4_INS3_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESC_NS_23Merge_v2_magic_divisionINS3_IJiiEEEEESC_NSB_IS8_EENS4_ISE_Lb0EEESC_SG_EEENS3_IJNS_8SequenceIJLi0EEEENSJ_IJLi1EEEENSJ_IJLi2EEEENSJ_IJLi3EEEENSJ_IJLi4ELi6EEEENSJ_IJLi7EEEENSJ_IJLi5EEEENSJ_IJLi8EEEENSJ_IJLi9EEEENSJ_IJLi10EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3EEEENSJ_IJLi4ELi5EEEENSJ_IJLi6EEEESP_SR_SS_ST_NSJ_IJLi11ELi12EEEENSJ_IJLi13EEEENSJ_IJLi14EEEEEEENSJ_IJLi11ELi12ELi13ELi14EEEEiEENS2_INS3_IJNS4_INS3_IJiiiiEEELb0EEESC_NS_3PadIiiiLb0EEES17_SC_SC_NS_5EmbedISE_SE_Lb0EEES19_SC_SA_SC_SC_SC_SC_SC_NSD_IS5_EES1A_SG_SH_SC_SG_EEENS3_IJSK_SL_SM_SN_NSJ_IJLi4EEEESQ_SX_SP_SR_SS_ST_NSJ_IJLi11EEEENSJ_IJLi12EEEESZ_S10_NSJ_IJLi15ELi18ELi20EEEENSJ_IJLi17ELi19ELi21EEEENSJ_IJLi16EEEENSJ_IJLi22EEEENSJ_IJLi23EEEENSJ_IJLi24EEEEEEENS3_IJNSJ_IJLi1ELi2ELi3ELi4EEEESQ_SX_SP_SR_SS_NSJ_IJLi10ELi11EEEENSJ_IJ s_endpgm .section .rodata,#alloc .p2align 6 .amdhsa_kernel 
_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ 
; Kernel descriptor fields for the kernel named by the preceding
; .amdhsa_kernel directive. The list is closed by .end_amdhsa_kernel below.
; Values are compiler-generated; do not hand-edit them without also updating
; the code and the "Kernel info" statistics that follow the function.

; --- Segment sizes (bytes) ---
; 34816 bytes of LDS and no scratch; this agrees with "LDSByteSize: 34816" and
; "ScratchSize: 0" in the Kernel info comment block after the function.
.amdhsa_group_segment_fixed_size 34816
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 544

; --- User SGPRs requested at launch ---
; Only the private segment buffer and the kernarg segment pointer are enabled.
; Kernel info reports "COMPUTE_PGM_RSRC2:USER_SGPR: 6".
; NOTE(review): presumably 4 SGPRs for the buffer descriptor + 2 for the
; pointer -- confirm against the LLVM AMDGPU usage guide.
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_flat_scratch_init 0
.amdhsa_user_sgpr_private_segment_size 0

; --- System SGPRs / VGPRs requested at launch ---
; Only the X workgroup id is enabled (Kernel info: TGID_X_EN: 1, TGID_Y_EN: 0,
; TGID_Z_EN: 0) and the work-item id setting is 0 (TIDIG_COMP_CNT: 0).
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 0
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0

; --- Register budget ---
; next_free_vgpr 256 matches "TotalNumVgprs: 256" in Kernel info.
; next_free_sgpr is 59 while Kernel info reports "NumSgprs: 61".
; NOTE(review): the difference is presumably the additionally reserved SGPRs
; implied by the reserve_* settings below -- confirm.
.amdhsa_next_free_vgpr 256
.amdhsa_next_free_sgpr 59
.amdhsa_reserve_flat_scratch 0
.amdhsa_reserve_xnack_mask 1

; --- Floating-point mode ---
; Round mode 0 and denorm mode 3 for both precision groups; IEEE mode on.
; Kernel info reports "FloatMode: 240" and "IeeeMode: 1".
; NOTE(review): 240 = 0xF0, presumably the two denorm fields (3,3) packed
; above the two round fields (0,0) -- confirm the bit layout.
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0

; --- Exception enables: all set to 0 ---
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel

; The operand of the following .section directive is on the next line of
; this (line-wrapped) listing.
.section 
.text._ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_,#alloc,#execinstr 
.Lfunc_end3: .size _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_, 
.Lfunc_end3-_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ ; -- End 
function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 12612 ; NumSgprs: 61 ; NumVgprs: 53 ; NumAgprs: 256 ; TotalNumVgprs: 256 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 34816 bytes/workgroup (compile time only) ; SGPRBlocks: 7 ; VGPRBlocks: 63 ; NumSGPRsForWavesPerEU: 61 ; NumVGPRsForWavesPerEU: 256 ; Occupancy: 1 ; WaveLimiterHint : 0 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 .ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.1.0 22114 5cba46feb6af367b1cafaa183ec42dbfb8207b14)" .section ".note.GNU-stack" .addrsig .amdgpu_metadata --- amdhsa.kernels: - .args: - .address_space: global .offset: 0 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 8 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 16 .size: 8 .value_kind: global_buffer - .offset: 24 .size: 112 .value_kind: by_value - .offset: 136 .size: 252 .value_kind: by_value - .offset: 388 .size: 48 .value_kind: by_value - .offset: 436 .size: 1 .value_kind: by_value - .offset: 437 .size: 1 .value_kind: by_value - .offset: 438 .size: 1 .value_kind: by_value - .offset: 440 .size: 104 .value_kind: by_value .group_segment_fixed_size: 34816 .kernarg_segment_align: 8 .kernarg_segment_size: 544 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: 
_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ 
.private_segment_fixed_size: 0 .sgpr_count: 50 .sgpr_spill_count: 0 .symbol: _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1EL
i2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_.kd .vgpr_count: 256 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .address_space: global .offset: 0 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 8 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 16 .size: 8 .value_kind: global_buffer - .offset: 24 .size: 112 .value_kind: by_value - .offset: 136 .size: 252 .value_kind: by_value - .offset: 388 .size: 48 .value_kind: by_value - .offset: 436 .size: 1 .value_kind: by_value - .offset: 437 .size: 1 .value_kind: by_value - .offset: 438 .size: 1 .value_kind: by_value - .offset: 440 .size: 104 .value_kind: by_value .group_segment_fixed_size: 34816 .kernarg_segment_align: 8 .kernarg_segment_size: 544 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEE
S1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .private_segment_fixed_size: 0 .sgpr_count: 50 .sgpr_spill_count: 0 .symbol: _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK
_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb1EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_.kd .vgpr_count: 256 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .address_space: global .offset: 0 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 8 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 16 .size: 8 .value_kind: global_buffer - .offset: 24 .size: 112 .value_kind: by_value - .offset: 136 .size: 252 .value_kind: by_value - .offset: 388 .size: 48 .value_kind: by_value - .offset: 436 .size: 1 .value_kind: by_value - .offset: 437 .size: 1 .value_kind: by_value - .offset: 438 .size: 1 .value_kind: by_value - .offset: 440 .size: 104 .value_kind: by_value .group_segment_fixed_size: 34816 .kernarg_segment_align: 8 .kernarg_segment_size: 544 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: 
_ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ 
.private_segment_fixed_size: 0 .sgpr_count: 61 .sgpr_spill_count: 0 .symbol: _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE0ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1EL
i2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_.kd .vgpr_count: 256 .vgpr_spill_count: 0 .wavefront_size: 64 - .args: - .address_space: global .offset: 0 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 8 .size: 8 .value_kind: global_buffer - .address_space: global .offset: 16 .size: 8 .value_kind: global_buffer - .offset: 24 .size: 112 .value_kind: by_value - .offset: 136 .size: 252 .value_kind: by_value - .offset: 388 .size: 48 .value_kind: by_value - .offset: 436 .size: 1 .value_kind: by_value - .offset: 437 .size: 1 .value_kind: by_value - .offset: 438 .size: 1 .value_kind: by_value - .offset: 440 .size: 104 .value_kind: by_value .group_segment_fixed_size: 34816 .kernarg_segment_align: 8 .kernarg_segment_size: 544 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 256 .name: _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEE
S1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_ .private_segment_fixed_size: 0 .sgpr_count: 61 .sgpr_spill_count: 0 .symbol: _ZN2ck25kernel_gemm_xdlops_v2r4r2INS_43GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2ILi256EDF16_fDF16_LNS_25InMemoryDataOperationEnumE1ENS_16TensorDescriptorINS_5TupleIJNS_7UnMergeINS4_IJiiiEEELb0EEENS5_INS4_IJiNS_17integral_constantIiLi8EEEEEELb0EEENS_11PassThroughIiEESD_NS_23Merge_v2_magic_divisionINS4_IJiiEEEEESD_NSC_IS9_EENS5_ISF_Lb0EEESD_SH_EEENS4_IJNS_8SequenceIJLi0EEEENSK_IJLi1EEEENSK_IJLi2EEEENSK_IJLi3EEEENSK_IJLi4ELi6EEEENSK_IJLi7EEEENSK_IJLi5EEEENSK_IJLi8EEEENSK_IJLi9EEEENSK_IJLi10EEEEEEENS4_IJNSK_IJLi1ELi2ELi3EEEENSK_IJLi4ELi5EEEENSK_IJLi6EEEESQ_SS_ST_SU_NSK_IJLi11ELi12EEEENSK_IJLi13EEEENSK_IJLi14EEEEEEENSK_IJLi11ELi12ELi13ELi14EEEEiEENS3_INS4_IJNS5_INS4_IJiiiiEEELb0EEESD_NS_3PadIiiiLb0EEES18_SD_SD_NS_5EmbedISF_SF_Lb0EEES1A_SD_SB_SD_SD_SD_SD_SD_NSE_IS6_EES1B_SH_SI_SD_SH_EEENS4_IJSL_SM_SN_SO_NSK_IJLi4EEEESR_SY_SQ_SS_ST_SU_NSK_IJLi11EEEENSK_IJLi12EEEES10_S11_NSK_IJLi15ELi18ELi20EEEENSK_IJLi17ELi19ELi21EEEENSK_IJLi16EEEENSK_IJLi22EEEENSK_IJLi23EEEENSK_IJLi24EEEEEEENS4_IJNSK_IJLi1ELi2ELi3ELi4EEEESR_SY_SQ_SS_ST_NSK_IJLi10ELi11EEEENSK_IJLi12ELi13EEEES11_NSK_IJLi15ELi16EEEENSK_IJLi17EEEENSK
_IJLi18EEEENSK_IJLi19EEEENSK_IJLi20EEEENSK_IJLi21EEEES1J_S1K_S1L_NSK_IJLi25ELi26EEEENSK_IJLi27EEEENSK_IJLi28EEEEEEENSK_IJLi25ELi26ELi27ELi28EEEEiEENS3_INS4_IJSI_EEENS4_IJSL_EEENS4_IJNSK_IJLi1ELi2EEEEEEES24_iEENS_16tensor_operation12element_wise11PassThroughES29_S29_Li256ELi256ELi4ELi32ELi32ELi8ELi4ELi4ENSK_IJLi1ELi4ELi32ELi2EEEENSK_IJLi0ELi3ELi1ELi2EEEENSK_IJLi0ELi2ELi1ELi3EEEELi2ELi8ELi4ELb0ELb1ES2A_S2B_S2C_Li2ELi8ELi4ELb0ELb1ELi1ELi1ELi8ENSK_IJLi1ELi32ELi1ELi4EEEELb1ELb1EEEDF16_DF16_S14_S21_NS3_INS4_IJSI_NS5_INS4_IJiNS8_IiLi256EEEEEELb0EEES2H_EEENS4_IJSL_SM_SN_EEENS4_IJS24_NSK_IJLi3ELi4EEEENSK_IJLi5ELi6EEEEEEENSK_IJLi3ELi4ELi5ELi6EEEEiEES29_S29_S29_NS_13TensorAdaptorINS4_IJSD_SI_SI_NSE_INS4_IJiiiiiEEEEEEEENS4_IJSL_SM_SN_NSK_IJLi3ELi4ELi5ELi6ELi7EEEEEEENS4_IJSO_SP_NSK_IJLi5ELi7EEEESS_EEENSK_IJLi0ELi1ELi2EEEESS_EELb0EEEvPKT0_S31_PT1_T2_T3_T4_T5_T6_T7_T8_.kd .vgpr_count: 256 .vgpr_spill_count: 0 .wavefront_size: 64 amdhsa.target: amdgcn-amd-amdhsa--gfx908 amdhsa.version: - 1 - 1 ... .end_amdgpu_metadata