OpenDAS / dgl / Commits / c454d419
Commit c454d419 authored May 12, 2023 by lisj
Delete the submodule's gitignore
parent 3359c1f1
Showing 4 changed files with 582 additions and 0 deletions
third_party/libxsmm/src/libxsmm_spmdm_begin.h        +64  −0
third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h   +166 −0
third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h +310 −0
third_party/libxsmm/src/libxsmm_spmdm_end.h          +42  −0
Too many changes to show. To preserve performance, only 264 of 264+ files are displayed.
third_party/libxsmm/src/libxsmm_spmdm_begin.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish, Hans Pabst (Intel Corp.)
******************************************************************************/
#define SIMD_WIDTH_FP32 (1)
#define SIMDTYPE_FP32 float
#define SIMDTYPE_INT32 int
#define SIMDMASKTYPE_FP32 int
#define _MM_SETZERO_FP32() (0)
#define _MM_SETZERO_INT32() (0)
#define _MM_SET1_FP32(x) (x)
#define _MM_SET1_INT32(x) (x)
#define _MM_SET1_INT16(x) (x)
#define _MM_LOAD_FP32(x) (*(x))
#define _MM_LOADU_FP32(x) (*(x))
#define _MM_LOAD_INT32(x) (*(x))
#define _MM_STORE_INT32(x,y) ((*(x)) = (y))
#define _MM_LOADU_INT32(x) (*(x))
#define _MM_GATHER_FP32(Addr, idx, scale) (*(Addr + (idx)))
#define _MM_CMPNEQ_FP32(v1,v2) (LIBXSMM_FEQ(v1, v2) ? 0 : 1)
#define _MM_STORE_FP32(x,y) ((*(x)) = (y))
#define _MM_STOREU_FP32(x,y) ((*(x)) = (y))
#define _MM_ADD_FP32(x,y) ((x) + (y))
#define _MM_FMADD_FP32(x,y,z) (((x)*(y))+(z))
#define _MM_MUL_FP32(x,y) ((x)*(y))
#define _MM_PREFETCH(x, y)
#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) ((*(ptr_B)) = (*(ptr_A)))
#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
uint16_t restmp = (*(ptr_A)); \
union { int i; float f; } res; \
res.i = restmp; \
res.i <<= 16; \
(*(ptr_B)) = res.f; \
}
#define COMPRESS_FP32(v, k, m, cnt) if (m) { \
values_ptr[cnt] = v; \
colidx_ptr[cnt] = (uint16_t)(k); \
cnt++; \
}
#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
union { int i; float f; } vlo_tmp, vhi_tmp; \
vlo_tmp.i = (v) & 0xFFFF; vlo_tmp.i <<= 16; \
vlo_final = vlo_tmp.f; \
vhi_tmp.i = (v) & 0xFFFF0000; \
vhi_final = vhi_tmp.f; \
}
#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
union { int i; float f; } vlo_tmp, vhi_tmp; \
vlo_tmp.f = vlo; \
v = (vlo_tmp.i >> 16); \
vhi_tmp.f = vhi; \
v = v | (vhi_tmp.i & 0xFFFF0000); \
}
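
The scalar EXPAND_BFLOAT16/COMPRESS_BFLOAT16 macros above rely on the fact that a bfloat16 value is simply the upper 16 bits of an IEEE-754 single-precision float. A minimal, hypothetical C sketch of that round trip (not part of the commit; the function names are illustrative):

#include <stdint.h>

/* bf16 -> fp32: place the 16 bf16 bits in the high half of a 32-bit word,
 * exactly what the shift in EXPAND_BFLOAT16 does. */
static float bf16_to_fp32(uint16_t h) {
  union { uint32_t i; float f; } u;
  u.i = (uint32_t)h << 16;
  return u.f;
}

/* fp32 -> bf16: keep only the high 16 bits (truncation), as in
 * COMPRESS_BFLOAT16. */
static uint16_t fp32_to_bf16(float x) {
  union { uint32_t i; float f; } u;
  u.f = x;
  return (uint16_t)(u.i >> 16);
}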
third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish, Hans Pabst (Intel Corp.)
******************************************************************************/
#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# error "libxsmm_intrinsics_x86.h not included!"
#endif
#if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
#define SIMD_WIDTH_FP32 (8)
#define SIMDTYPE_FP32 __m256
#define SIMDTYPE_INT32 __m256i
#define SIMDMASKTYPE_FP32 __m256
#define _MM_SETZERO_FP32 _mm256_setzero_ps
#define _MM_SETZERO_INT32 _mm256_setzero_si256
#define _MM_SET1_FP32 _mm256_set1_ps
#define _MM_SET1_INT32 _mm256_set1_epi32
#define _MM_SET1_INT16 _mm256_set1_epi16
#define _MM_SET_INT32 _mm256_set_epi32
#define _MM_LOAD_FP32 _mm256_loadu_ps
#define _MM_LOADU_FP32 _mm256_loadu_ps
#define _MM_LOAD_INT32 _mm256_loadu_si256
#define _MM_STORE_INT32 _mm256_storeu_si256
#define _MM_LOADU_INT32(x) _mm256_loadu_si256( (__m256i const *)(x))
#define _MM_GATHER_INT32(Addr, idx, scale) _mm256_i32gather_epi32((Addr), (idx), (scale))
#define _MM_GATHER_FP32(Addr, idx, scale) _mm256_i32gather_ps(((float const *)(Addr)), (idx), (scale))
#define _MM_CMPNEQ_FP32(v1,v2) _mm256_cmp_ps(v1,v2,12)
#define _MM_STORE_FP32 _mm256_storeu_ps
#define _MM_STOREU_FP32 _mm256_storeu_ps
#define _MM_ADD_FP32 _mm256_add_ps
#define _MM_FMADD_FP32 _mm256_fmadd_ps
#define _MM_MUL_FP32 _mm256_mul_ps
#define _MM_PREFETCH(x, y) _mm_prefetch(x, y)
#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \
__m256 ymm9 = _mm256_loadu_ps(ptr_A); \
__m256 ymm10 = _mm256_loadu_ps(ptr_A + (size_t)ldA); \
__m256 ymm11 = _mm256_loadu_ps(ptr_A + (size_t)ldA*2); \
__m256 ymm12 = _mm256_loadu_ps(ptr_A + (size_t)ldA*3); \
__m256 ymm13 = _mm256_loadu_ps(ptr_A + (size_t)ldA*4); \
__m256 ymm14 = _mm256_loadu_ps(ptr_A + (size_t)ldA*5); \
__m256 ymm15 = _mm256_loadu_ps(ptr_A + (size_t)ldA*6); \
__m256 ymm2 = _mm256_loadu_ps(ptr_A + (size_t)ldA*7); \
__m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \
__m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \
__m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \
__m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \
ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \
__m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \
ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \
ymm6 = _mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \
__m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \
ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \
ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \
ymm3 = _mm256_permute2f128_ps(ymm10, ymm11, 0x20); \
_mm256_storeu_ps(ptr_B, ymm3);{ \
__m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \
ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \
__m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \
ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \
ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \
ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \
ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \
ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \
ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \
ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \
ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \
ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \
ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \
ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*4, ymm1); \
ymm5 = _mm256_permute2f128_ps(ymm8, ymm5, 0x31); \
ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}} \
}
#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
__m256 ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm2; \
__m256i vload_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A))); \
vload_1 = _mm256_inserti128_si256(vload_1, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA)), 1); \
EXPAND_BFLOAT16(vload_1, ymm9, ymm10);{ \
__m256i vload_2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*2))); \
vload_2 = _mm256_inserti128_si256(vload_2, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*3)), 1); \
EXPAND_BFLOAT16(vload_2, ymm11, ymm12);{ \
__m256i vload_3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*4))); \
vload_3 = _mm256_inserti128_si256(vload_3, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*5)), 1); \
EXPAND_BFLOAT16(vload_3, ymm13, ymm14);{ \
__m256i vload_4 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*6))); \
vload_4 = _mm256_inserti128_si256(vload_4, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*7)), 1); \
EXPAND_BFLOAT16(vload_4, ymm15, ymm2);{ \
__m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \
__m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \
__m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \
__m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \
ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \
__m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \
ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \
ymm6 = _mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \
__m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \
ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \
ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \
ymm3 = _mm256_permute2f128_ps(ymm10, ymm11, 0x20); \
_mm256_storeu_ps(ptr_B, ymm3);{ \
__m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \
ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \
__m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \
ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \
ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \
ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \
ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \
ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \
ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \
ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \
ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \
ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \
ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \
ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*4, ymm1); \
ymm5 = _mm256_permute2f128_ps(ymm8, ymm5, 0x31); \
ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}}}}}} \
}
#define COMPRESS_FP32(v, k, m, cnt) { \
const unsigned int mask = _mm256_movemask_ps(m); \
const SIMDTYPE_INT32 vk = _MM_SET1_INT16((short)(k)); \
const __m256i perm_ctrl = _mm256_loadu_si256(&shufmasks[mask]); \
const __m256 v_packed = _mm256_permutevar8x32_ps(v, perm_ctrl); \
const __m256i v_shuff = _mm256_loadu_si256(&shufmasks2[mask]); \
const __m256i v_idx = _mm256_add_epi32(vk, v_shuff); \
_mm256_storeu_ps(values_ptr + (cnt), v_packed); \
_mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \
cnt = (unsigned short)((cnt) + _mm_popcnt_u32(mask)); \
}
#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
const __m256i vlo = _mm256_unpacklo_epi16(vzero, v); \
const __m256i vhi = _mm256_unpackhi_epi16(vzero, v); \
vlo_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x20)); \
vhi_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x31)); \
}
#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
const __m256i vtmp1 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x20)); \
const __m256i vtmp2 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x31)); \
const __m256i a = _mm256_srli_epi32(vtmp1, 16), b = _mm256_srli_epi32(vtmp2, 16); \
v = _mm256_packus_epi32(a, b); \
}
#endif
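
The AVX2 variant above defines the same macro surface as the scalar header, so kernel bodies written once against these names compile for either target. A minimal, hypothetical sketch of such code (not part of the commit; spmdm_style_axpy is an illustrative name), assuming one of the begin headers has already been included:

static void spmdm_style_axpy(int n, float alpha, const float* x, float* y) {
  const SIMDTYPE_FP32 valpha = _MM_SET1_FP32(alpha);
  int i = 0;
  /* SIMD_WIDTH_FP32 is 1 for the scalar header and 8 for AVX2. */
  for (; i + SIMD_WIDTH_FP32 <= n; i += SIMD_WIDTH_FP32) {
    const SIMDTYPE_FP32 vx = _MM_LOADU_FP32(x + i);
    const SIMDTYPE_FP32 vy = _MM_LOADU_FP32(y + i);
    _MM_STOREU_FP32(y + i, _MM_FMADD_FP32(valpha, vx, vy));
  }
  for (; i < n; ++i) y[i] = alpha * x[i] + y[i]; /* scalar remainder */
}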
third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish, Hans Pabst (Intel Corp.)
******************************************************************************/
#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# error "libxsmm_intrinsics_x86.h not included!"
#endif
#if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
#define SIMD_WIDTH_FP32 (16)
#define SIMDTYPE_FP32 __m512
#define SIMDTYPE_INT32 __m512i
#define SIMDMASKTYPE_FP32 __mmask16
#define _MM_SETZERO_FP32 _mm512_setzero_ps
#define _MM_SETZERO_INT32 _mm512_setzero_epi32
#define _MM_SET1_FP32 _mm512_set1_ps
#define _MM_SET1_INT32 _mm512_set1_epi32
#define _MM_SET1_INT16 _mm512_set1_epi16
#define _MM_SET_INT32 _mm512_set_epi32
#define _MM_LOAD_FP32 LIBXSMM_INTRINSICS_MM512_LOAD_PS
#define _MM_LOADU_FP32 _mm512_loadu_ps
#define _MM_LOAD_INT32 _mm512_loadu_si512
#define _MM_STORE_INT32 _mm512_storeu_si512
#define _MM_LOADU_INT32(x) _mm512_loadu_si512( (void const *)(x))
#define _MM_GATHER_INT32(Addr, idx, scale) _mm512_i32gather_epi32((idx), (Addr), (scale))
#define _MM_GATHER_FP32(Addr, idx, scale) _mm512_i32gather_ps((idx), (Addr), (scale))
#define _MM_CMPNEQ_FP32(v1,v2) _mm512_cmp_ps_mask(v1,v2,12)
#define _MM_STORE_FP32 _mm512_storeu_ps
#define _MM_STOREU_FP32 _mm512_storeu_ps
#define _MM_ADD_FP32 _mm512_add_ps
#define _MM_FMADD_FP32 _mm512_fmadd_ps
#define _MM_MUL_FP32 _mm512_mul_ps
#define _MM_PREFETCH(x, y) _mm_prefetch(x, y)
#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \
__m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \
__m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \
r0 = _mm512_loadu_ps(ptr_A); \
r1 = _mm512_loadu_ps(ptr_A + ldA); \
r2 = _mm512_loadu_ps(ptr_A + 2*ldA); \
r3 = _mm512_loadu_ps(ptr_A + 3*ldA); \
r4 = _mm512_loadu_ps(ptr_A + 4*ldA); \
r5 = _mm512_loadu_ps(ptr_A + 5*ldA); \
r6 = _mm512_loadu_ps(ptr_A + 6*ldA); \
r7 = _mm512_loadu_ps(ptr_A + 7*ldA); \
r8 = _mm512_loadu_ps(ptr_A + 8*ldA); \
r9 = _mm512_loadu_ps(ptr_A + 9*ldA); \
ra = _mm512_loadu_ps(ptr_A + 10*ldA); \
rb = _mm512_loadu_ps(ptr_A + 11*ldA); \
rc = _mm512_loadu_ps(ptr_A + 12*ldA); \
rd = _mm512_loadu_ps(ptr_A + 13*ldA); \
re = _mm512_loadu_ps(ptr_A + 14*ldA); \
rf = _mm512_loadu_ps(ptr_A + 15*ldA); \
\
t0 = _mm512_unpacklo_ps(r0,r1); \
t1 = _mm512_unpackhi_ps(r0,r1); \
t2 = _mm512_unpacklo_ps(r2,r3); \
t3 = _mm512_unpackhi_ps(r2,r3); \
t4 = _mm512_unpacklo_ps(r4,r5); \
t5 = _mm512_unpackhi_ps(r4,r5); \
t6 = _mm512_unpacklo_ps(r6,r7); \
t7 = _mm512_unpackhi_ps(r6,r7); \
t8 = _mm512_unpacklo_ps(r8,r9); \
t9 = _mm512_unpackhi_ps(r8,r9); \
ta = _mm512_unpacklo_ps(ra,rb); \
tb = _mm512_unpackhi_ps(ra,rb); \
tc = _mm512_unpacklo_ps(rc,rd); \
td = _mm512_unpackhi_ps(rc,rd); \
te = _mm512_unpacklo_ps(re,rf); \
tf = _mm512_unpackhi_ps(re,rf); \
\
{ const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \
r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \
r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \
r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \
r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \
r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \
ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rb = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \
rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \
re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
\
t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \
t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \
t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \
t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \
t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \
t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \
t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \
t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \
t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \
t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \
ta = _mm512_shuffle_f32x4(ra, re, 0x88); \
tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \
tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \
td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \
te = _mm512_shuffle_f32x4(ra, re, 0xdd); \
tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \
\
r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \
r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \
r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \
r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \
r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \
r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \
r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \
r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \
r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \
r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \
ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \
rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \
rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \
rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \
re = _mm512_shuffle_f32x4(t6, te, 0xdd); \
rf = _mm512_shuffle_f32x4(t7, tf, 0xdd); \
\
_mm512_storeu_ps(ptr_B + 0*ldB, r0); \
_mm512_storeu_ps(ptr_B + 1*ldB, r1); \
_mm512_storeu_ps(ptr_B + 2*ldB, r2); \
_mm512_storeu_ps(ptr_B + 3*ldB, r3); \
_mm512_storeu_ps(ptr_B + 4*ldB, r4); \
_mm512_storeu_ps(ptr_B + 5*ldB, r5); \
_mm512_storeu_ps(ptr_B + 6*ldB, r6); \
_mm512_storeu_ps(ptr_B + 7*ldB, r7); \
_mm512_storeu_ps(ptr_B + 8*ldB, r8); \
_mm512_storeu_ps(ptr_B + 9*ldB, r9); \
_mm512_storeu_ps(ptr_B + 10*ldB, ra); \
_mm512_storeu_ps(ptr_B + 11*ldB, rb); \
_mm512_storeu_ps(ptr_B + 12*ldB, rc); \
_mm512_storeu_ps(ptr_B + 13*ldB, rd); \
_mm512_storeu_ps(ptr_B + 14*ldB, re); \
_mm512_storeu_ps(ptr_B + 15*ldB, rf); \
}
#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
__m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \
__m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \
__m512i vload_1 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A))); \
vload_1 = _mm512_inserti32x8(vload_1, _mm256_loadu_si256((const __m256i*)(ptr_A + ldA)), 1); \
EXPAND_BFLOAT16(vload_1, r0, r1);{ \
__m512i vload_2 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 2*ldA))); \
vload_2 = _mm512_inserti32x8(vload_2, _mm256_loadu_si256((const __m256i*)(ptr_A + 3*ldA)), 1); \
EXPAND_BFLOAT16(vload_2, r2, r3);{ \
__m512i vload_3 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 4*ldA))); \
vload_3 = _mm512_inserti32x8(vload_3, _mm256_loadu_si256((const __m256i*)(ptr_A + 5*ldA)), 1); \
EXPAND_BFLOAT16(vload_3, r4, r5);{ \
__m512i vload_4 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 6*ldA))); \
vload_4 = _mm512_inserti32x8(vload_4, _mm256_loadu_si256((const __m256i*)(ptr_A + 7*ldA)), 1); \
EXPAND_BFLOAT16(vload_4, r6, r7);{ \
__m512i vload_5 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 8*ldA))); \
vload_5 = _mm512_inserti32x8(vload_5, _mm256_loadu_si256((const __m256i*)(ptr_A + 9*ldA)), 1); \
EXPAND_BFLOAT16(vload_5, r8, r9);{ \
__m512i vload_6 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 10*ldA))); \
vload_6 = _mm512_inserti32x8(vload_6, _mm256_loadu_si256((const __m256i*)(ptr_A + 11*ldA)), 1); \
EXPAND_BFLOAT16(vload_6, ra, rb);{ \
__m512i vload_7 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 12*ldA))); \
vload_7 = _mm512_inserti32x8(vload_7, _mm256_loadu_si256((const __m256i*)(ptr_A + 13*ldA)), 1); \
EXPAND_BFLOAT16(vload_7, rc, rd);{ \
__m512i vload_8 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 14*ldA))); \
vload_8 = _mm512_inserti32x8(vload_8, _mm256_loadu_si256((const __m256i*)(ptr_A + 15*ldA)), 1); \
EXPAND_BFLOAT16(vload_8, re, rf); \
\
t0 = _mm512_unpacklo_ps(r0,r1); \
t1 = _mm512_unpackhi_ps(r0,r1); \
t2 = _mm512_unpacklo_ps(r2,r3); \
t3 = _mm512_unpackhi_ps(r2,r3); \
t4 = _mm512_unpacklo_ps(r4,r5); \
t5 = _mm512_unpackhi_ps(r4,r5); \
t6 = _mm512_unpacklo_ps(r6,r7); \
t7 = _mm512_unpackhi_ps(r6,r7); \
t8 = _mm512_unpacklo_ps(r8,r9); \
t9 = _mm512_unpackhi_ps(r8,r9); \
ta = _mm512_unpacklo_ps(ra,rb); \
tb = _mm512_unpackhi_ps(ra,rb); \
tc = _mm512_unpacklo_ps(rc,rd); \
td = _mm512_unpackhi_ps(rc,rd); \
te = _mm512_unpacklo_ps(re,rf); \
tf = _mm512_unpackhi_ps(re,rf); \
\
{ const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \
r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \
r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \
r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \
r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \
r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \
ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rb = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \
rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \
re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
\
t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \
t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \
t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \
t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \
t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \
t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \
t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \
t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \
t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \
t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \
ta = _mm512_shuffle_f32x4(ra, re, 0x88); \
tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \
tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \
td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \
te = _mm512_shuffle_f32x4(ra, re, 0xdd); \
tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \
\
r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \
r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \
r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \
r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \
r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \
r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \
r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \
r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \
r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \
r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \
ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \
rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \
rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \
rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \
re = _mm512_shuffle_f32x4(t6, te, 0xdd); \
rf = _mm512_shuffle_f32x4(t7, tf, 0xdd); \
\
_mm512_storeu_ps(ptr_B + 0*ldB, r0); \
_mm512_storeu_ps(ptr_B + 1*ldB, r1); \
_mm512_storeu_ps(ptr_B + 2*ldB, r2); \
_mm512_storeu_ps(ptr_B + 3*ldB, r3); \
_mm512_storeu_ps(ptr_B + 4*ldB, r4); \
_mm512_storeu_ps(ptr_B + 5*ldB, r5); \
_mm512_storeu_ps(ptr_B + 6*ldB, r6); \
_mm512_storeu_ps(ptr_B + 7*ldB, r7); \
_mm512_storeu_ps(ptr_B + 8*ldB, r8); \
_mm512_storeu_ps(ptr_B + 9*ldB, r9); \
_mm512_storeu_ps(ptr_B + 10*ldB, ra); \
_mm512_storeu_ps(ptr_B + 11*ldB, rb); \
_mm512_storeu_ps(ptr_B + 12*ldB, rc); \
_mm512_storeu_ps(ptr_B + 13*ldB, rd); \
_mm512_storeu_ps(ptr_B + 14*ldB, re); \
_mm512_storeu_ps(ptr_B + 15*ldB, rf);}}}}}}} \
}
#define COMPRESS_FP32(v, k, m, cnt) { \
_mm512_mask_compressstoreu_ps(values_ptr + (cnt), m, v); \
{ \
__m256i vk1 = _mm256_set1_epi16((short)(k)); \
__m256i vk2 = _mm256_set1_epi16((short)((k) + 8)); \
__m256i v_idx = _mm256_add_epi32(vk1, _mm256_loadu_si256(&shufmasks2[(m)&0xFF])); \
__m256i v_idx_2 = _mm256_add_epi32(vk2, _mm256_loadu_si256(&shufmasks2[((m)>>8)&0xFF])); \
_mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \
cnt = (unsigned short)((cnt) + _mm_popcnt_u32((m)&0xFF)); \
_mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx_2); \
cnt = (unsigned short)((cnt) + _mm_popcnt_u32(((m)>>8)&0xFF)); \
} \
}
#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
const __m512i vlo = _mm512_unpacklo_epi16(vzero, v); \
const __m512i vhi = _mm512_unpackhi_epi16(vzero, v); \
const __m512i permmask1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); \
const __m512i permmask2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); \
vlo_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask1, vhi)); \
vhi_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask2, vhi)); \
}
#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
const __m512i permmask1 = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0); \
const __m512i permmask2 = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2); \
const __m512i va = _mm512_castps_si512(vlo), vb = _mm512_castps_si512(vhi); \
const __m512i vtmp1 = _mm512_permutex2var_epi64(va, permmask1, vb); \
const __m512i vtmp2 = _mm512_permutex2var_epi64(va, permmask2, vb); \
const __m512i a = _mm512_srli_epi32(vtmp1, 16), b = _mm512_srli_epi32(vtmp2, 16); \
v = _mm512_packus_epi32(a, b); \
}
#endif
third_party/libxsmm/src/libxsmm_spmdm_end.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#undef SIMD_WIDTH_FP32
#undef SIMDTYPE_FP32
#undef SIMDTYPE_INT32
#undef SIMDMASKTYPE_FP32
#undef _MM_SETZERO_FP32
#undef _MM_SETZERO_INT32
#undef _MM_SET1_FP32
#undef _MM_SET1_INT32
#undef _MM_SET1_INT16
#undef _MM_SET_INT32
#undef _MM_LOAD_FP32
#undef _MM_LOADU_FP32
#undef _MM_LOAD_INT32
#undef _MM_STORE_INT32
#undef _MM_LOADU_INT32
#undef _MM_GATHER_INT32
#undef _MM_GATHER_FP32
#undef _MM_CMPNEQ_FP32
#undef _MM_STORE_FP32
#undef _MM_STOREU_FP32
#undef _MM_ADD_FP32
#undef _MM_FMADD_FP32
#undef _MM_MUL_FP32
#undef _MM_PREFETCH
#undef TRANSPOSE_SIMD_WIDTH_KERNEL
#undef TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16
#undef COMPRESS_FP32
#undef EXPAND_BFLOAT16
#undef COMPRESS_BFLOAT16
#undef num_regs
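
libxsmm_spmdm_end.h removes everything a begin header defined, which is what lets one shared kernel body be compiled several times, once per instruction-set target. A hypothetical sketch of that include pattern (spmdm_kernel_body.h is a placeholder name, not a file from this commit; in practice the body is guarded by the same LIBXSMM_MAX_STATIC_TARGET_ARCH checks the begin headers use):

#include "libxsmm_spmdm_begin.h"        /* scalar macro surface */
#include "spmdm_kernel_body.h"          /* placeholder shared kernel body */
#include "libxsmm_spmdm_end.h"

#include "libxsmm_spmdm_begin_avx2.h"   /* AVX2 macro surface */
#include "spmdm_kernel_body.h"
#include "libxsmm_spmdm_end.h"

#include "libxsmm_spmdm_begin_avx512.h" /* AVX-512 macro surface */
#include "spmdm_kernel_body.h"
#include "libxsmm_spmdm_end.h"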