OpenDAS / dgl / Commits / c454d419
Commit c454d419 authored May 12, 2023 by lisj
Delete the submodule's gitignore
parent 3359c1f1
Showing 4 changed files with 582 additions and 0 deletions
third_party/libxsmm/src/libxsmm_spmdm_begin.h        +64  −0
third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h   +166 −0
third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h +310 −0
third_party/libxsmm/src/libxsmm_spmdm_end.h          +42  −0
Too many changes to show. To preserve performance, only 264 of 264+ files are displayed.
third_party/libxsmm/src/libxsmm_spmdm_begin.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish, Hans Pabst (Intel Corp.)
******************************************************************************/
#define SIMD_WIDTH_FP32 (1)
#define SIMDTYPE_FP32 float
#define SIMDTYPE_INT32 int
#define SIMDMASKTYPE_FP32 int
#define _MM_SETZERO_FP32() (0)
#define _MM_SETZERO_INT32() (0)
#define _MM_SET1_FP32(x) (x)
#define _MM_SET1_INT32(x) (x)
#define _MM_SET1_INT16(x) (x)
#define _MM_LOAD_FP32(x) (*(x))
#define _MM_LOADU_FP32(x) (*(x))
#define _MM_LOAD_INT32(x) (*(x))
#define _MM_STORE_INT32(x,y) ((*(x)) = (y))
#define _MM_LOADU_INT32(x) (*(x))
#define _MM_GATHER_FP32(Addr, idx, scale) (*(Addr + (idx)))
#define _MM_CMPNEQ_FP32(v1,v2) (LIBXSMM_FEQ(v1, v2) ? 0 : 1)
#define _MM_STORE_FP32(x,y) ((*(x)) = (y))
#define _MM_STOREU_FP32(x,y) ((*(x)) = (y))
#define _MM_ADD_FP32(x,y) ((x) + (y))
#define _MM_FMADD_FP32(x,y,z) (((x)*(y))+(z))
#define _MM_MUL_FP32(x,y) ((x)*(y))
#define _MM_PREFETCH(x, y)
#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) ((*(ptr_B)) = (*(ptr_A)))
#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
uint16_t restmp = (*(ptr_A)); \
union { int i; float f; } res; \
res.i = restmp; \
res.i <<= 16; \
(*(ptr_B)) = res.f; \
}
#define COMPRESS_FP32(v, k, m, cnt) if (m) { \
values_ptr[cnt] = v; \
colidx_ptr[cnt] = (uint16_t)(k); \
cnt++; \
}
#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
union { int i; float f; } vlo_tmp, vhi_tmp; \
vlo_tmp.i = (v) & 0xFFFF; vlo_tmp.i <<= 16; \
vlo_final = vlo_tmp.f; \
vhi_tmp.i = (v) & 0xFFFF0000; \
vhi_final = vhi_tmp.f; \
}
#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
union { int i; float f; } vlo_tmp, vhi_tmp; \
vlo_tmp.f = vlo; \
v = (vlo_tmp.i >> 16); \
vhi_tmp.f = vhi; \
v = v | (vhi_tmp.i & 0xFFFF0000); \
}
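
The scalar EXPAND_BFLOAT16/COMPRESS_BFLOAT16 macros above rely on the fact that a bfloat16 value is simply the upper 16 bits of an IEEE-754 single-precision float. A minimal, hypothetical C sketch of that round trip (not part of the commit; the function names are illustrative):

#include <stdint.h>

/* bf16 -> fp32: place the 16 bf16 bits in the high half of a 32-bit word,
 * exactly what the shift in EXPAND_BFLOAT16 does. */
static float bf16_to_fp32(uint16_t h) {
  union { uint32_t i; float f; } u;
  u.i = (uint32_t)h << 16;
  return u.f;
}

/* fp32 -> bf16: keep only the high 16 bits (truncation), as in
 * COMPRESS_BFLOAT16. */
static uint16_t fp32_to_bf16(float x) {
  union { uint32_t i; float f; } u;
  u.f = x;
  return (uint16_t)(u.i >> 16);
}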
third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish, Hans Pabst (Intel Corp.)
******************************************************************************/
#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# error "libxsmm_intrinsics_x86.h not included!"
#endif
#if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
#define SIMD_WIDTH_FP32 (8)
#define SIMDTYPE_FP32 __m256
#define SIMDTYPE_INT32 __m256i
#define SIMDMASKTYPE_FP32 __m256
#define _MM_SETZERO_FP32 _mm256_setzero_ps
#define _MM_SETZERO_INT32 _mm256_setzero_si256
#define _MM_SET1_FP32 _mm256_set1_ps
#define _MM_SET1_INT32 _mm256_set1_epi32
#define _MM_SET1_INT16 _mm256_set1_epi16
#define _MM_SET_INT32 _mm256_set_epi32
#define _MM_LOAD_FP32 _mm256_loadu_ps
#define _MM_LOADU_FP32 _mm256_loadu_ps
#define _MM_LOAD_INT32 _mm256_loadu_si256
#define _MM_STORE_INT32 _mm256_storeu_si256
#define _MM_LOADU_INT32(x) _mm256_loadu_si256( (__m256i const *)(x))
#define _MM_GATHER_INT32(Addr, idx, scale) _mm256_i32gather_epi32((Addr), (idx), (scale))
#define _MM_GATHER_FP32(Addr, idx, scale) _mm256_i32gather_ps(((float const *)(Addr)), (idx), (scale))
#define _MM_CMPNEQ_FP32(v1,v2) _mm256_cmp_ps(v1,v2,12)
#define _MM_STORE_FP32 _mm256_storeu_ps
#define _MM_STOREU_FP32 _mm256_storeu_ps
#define _MM_ADD_FP32 _mm256_add_ps
#define _MM_FMADD_FP32 _mm256_fmadd_ps
#define _MM_MUL_FP32 _mm256_mul_ps
#define _MM_PREFETCH(x, y) _mm_prefetch(x, y)
#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \
__m256 ymm9 = _mm256_loadu_ps(ptr_A); \
__m256 ymm10 = _mm256_loadu_ps(ptr_A + (size_t)ldA); \
__m256 ymm11 = _mm256_loadu_ps(ptr_A + (size_t)ldA*2); \
__m256 ymm12 = _mm256_loadu_ps(ptr_A + (size_t)ldA*3); \
__m256 ymm13 = _mm256_loadu_ps(ptr_A + (size_t)ldA*4); \
__m256 ymm14 = _mm256_loadu_ps(ptr_A + (size_t)ldA*5); \
__m256 ymm15 = _mm256_loadu_ps(ptr_A + (size_t)ldA*6); \
__m256 ymm2 = _mm256_loadu_ps(ptr_A + (size_t)ldA*7); \
__m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \
__m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \
__m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \
__m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \
ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \
__m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \
ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \
ymm6 = _mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \
__m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \
ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \
ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \
ymm3 = _mm256_permute2f128_ps(ymm10, ymm11, 0x20); \
_mm256_storeu_ps(ptr_B, ymm3);{ \
__m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \
ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \
__m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \
ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \
ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \
ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \
ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \
ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \
ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \
ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \
ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \
ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \
ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \
ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*4, ymm1); \
ymm5 = _mm256_permute2f128_ps(ymm8, ymm5, 0x31); \
ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}} \
}
#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
__m256 ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm2; \
__m256i vload_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A))); \
vload_1 = _mm256_inserti128_si256(vload_1, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA)), 1); \
EXPAND_BFLOAT16(vload_1, ymm9, ymm10);{ \
__m256i vload_2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*2))); \
vload_2 = _mm256_inserti128_si256(vload_2, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*3)), 1); \
EXPAND_BFLOAT16(vload_2, ymm11, ymm12);{ \
__m256i vload_3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*4))); \
vload_3 = _mm256_inserti128_si256(vload_3, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*5)), 1); \
EXPAND_BFLOAT16(vload_3, ymm13, ymm14);{ \
__m256i vload_4 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*6))); \
vload_4 = _mm256_inserti128_si256(vload_4, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*7)), 1); \
EXPAND_BFLOAT16(vload_4, ymm15, ymm2);{ \
__m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \
__m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \
__m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \
__m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \
ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \
__m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \
ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \
ymm6 = _mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \
__m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \
ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \
ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \
ymm3 = _mm256_permute2f128_ps(ymm10, ymm11, 0x20); \
_mm256_storeu_ps(ptr_B, ymm3);{ \
__m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \
ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \
__m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \
ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \
ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \
ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \
ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \
ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \
ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \
ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \
ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \
ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \
ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \
ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*4, ymm1); \
ymm5 = _mm256_permute2f128_ps(ymm8, ymm5, 0x31); \
ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \
_mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}}}}}} \
}
#define COMPRESS_FP32(v, k, m, cnt) { \
const unsigned int mask = _mm256_movemask_ps(m); \
const SIMDTYPE_INT32 vk = _MM_SET1_INT16((short)(k)); \
const __m256i perm_ctrl = _mm256_loadu_si256(&shufmasks[mask]); \
const __m256 v_packed = _mm256_permutevar8x32_ps(v, perm_ctrl); \
const __m256i v_shuff = _mm256_loadu_si256(&shufmasks2[mask]); \
const __m256i v_idx = _mm256_add_epi32(vk, v_shuff); \
_mm256_storeu_ps(values_ptr + (cnt), v_packed); \
_mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \
cnt = (unsigned short)((cnt) + _mm_popcnt_u32(mask)); \
}
#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
const __m256i vlo = _mm256_unpacklo_epi16(vzero, v); \
const __m256i vhi = _mm256_unpackhi_epi16(vzero, v); \
vlo_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x20)); \
vhi_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x31)); \
}
#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
const __m256i vtmp1 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x20)); \
const __m256i vtmp2 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x31)); \
const __m256i a = _mm256_srli_epi32(vtmp1, 16), b = _mm256_srli_epi32(vtmp2, 16); \
v = _mm256_packus_epi32(a, b); \
}
#endif
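
The AVX2 variant above defines the same macro surface as the scalar header, so kernel bodies written once against these names compile for either target. A minimal, hypothetical sketch of such code (not part of the commit; spmdm_style_axpy is an illustrative name), assuming one of the begin headers has already been included:

static void spmdm_style_axpy(int n, float alpha, const float* x, float* y) {
  const SIMDTYPE_FP32 valpha = _MM_SET1_FP32(alpha);
  int i = 0;
  /* SIMD_WIDTH_FP32 is 1 for the scalar header and 8 for AVX2. */
  for (; i + SIMD_WIDTH_FP32 <= n; i += SIMD_WIDTH_FP32) {
    const SIMDTYPE_FP32 vx = _MM_LOADU_FP32(x + i);
    const SIMDTYPE_FP32 vy = _MM_LOADU_FP32(y + i);
    _MM_STOREU_FP32(y + i, _MM_FMADD_FP32(valpha, vx, vy));
  }
  for (; i < n; ++i) y[i] = alpha * x[i] + y[i]; /* scalar remainder */
}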
third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish, Hans Pabst (Intel Corp.)
******************************************************************************/
#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# error "libxsmm_intrinsics_x86.h not included!"
#endif
#if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
#define SIMD_WIDTH_FP32 (16)
#define SIMDTYPE_FP32 __m512
#define SIMDTYPE_INT32 __m512i
#define SIMDMASKTYPE_FP32 __mmask16
#define _MM_SETZERO_FP32 _mm512_setzero_ps
#define _MM_SETZERO_INT32 _mm512_setzero_epi32
#define _MM_SET1_FP32 _mm512_set1_ps
#define _MM_SET1_INT32 _mm512_set1_epi32
#define _MM_SET1_INT16 _mm512_set1_epi16
#define _MM_SET_INT32 _mm512_set_epi32
#define _MM_LOAD_FP32 LIBXSMM_INTRINSICS_MM512_LOAD_PS
#define _MM_LOADU_FP32 _mm512_loadu_ps
#define _MM_LOAD_INT32 _mm512_loadu_si512
#define _MM_STORE_INT32 _mm512_storeu_si512
#define _MM_LOADU_INT32(x) _mm512_loadu_si512( (void const *)(x))
#define _MM_GATHER_INT32(Addr, idx, scale) _mm512_i32gather_epi32((idx), (Addr), (scale))
#define _MM_GATHER_FP32(Addr, idx, scale) _mm512_i32gather_ps((idx), (Addr), (scale))
#define _MM_CMPNEQ_FP32(v1,v2) _mm512_cmp_ps_mask(v1,v2,12)
#define _MM_STORE_FP32 _mm512_storeu_ps
#define _MM_STOREU_FP32 _mm512_storeu_ps
#define _MM_ADD_FP32 _mm512_add_ps
#define _MM_FMADD_FP32 _mm512_fmadd_ps
#define _MM_MUL_FP32 _mm512_mul_ps
#define _MM_PREFETCH(x, y) _mm_prefetch(x, y)
#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \
__m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \
__m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \
r0 = _mm512_loadu_ps(ptr_A); \
r1 = _mm512_loadu_ps(ptr_A + ldA); \
r2 = _mm512_loadu_ps(ptr_A + 2*ldA); \
r3 = _mm512_loadu_ps(ptr_A + 3*ldA); \
r4 = _mm512_loadu_ps(ptr_A + 4*ldA); \
r5 = _mm512_loadu_ps(ptr_A + 5*ldA); \
r6 = _mm512_loadu_ps(ptr_A + 6*ldA); \
r7 = _mm512_loadu_ps(ptr_A + 7*ldA); \
r8 = _mm512_loadu_ps(ptr_A + 8*ldA); \
r9 = _mm512_loadu_ps(ptr_A + 9*ldA); \
ra = _mm512_loadu_ps(ptr_A + 10*ldA); \
rb = _mm512_loadu_ps(ptr_A + 11*ldA); \
rc = _mm512_loadu_ps(ptr_A + 12*ldA); \
rd = _mm512_loadu_ps(ptr_A + 13*ldA); \
re = _mm512_loadu_ps(ptr_A + 14*ldA); \
rf = _mm512_loadu_ps(ptr_A + 15*ldA); \
\
t0 = _mm512_unpacklo_ps(r0,r1); \
t1 = _mm512_unpackhi_ps(r0,r1); \
t2 = _mm512_unpacklo_ps(r2,r3); \
t3 = _mm512_unpackhi_ps(r2,r3); \
t4 = _mm512_unpacklo_ps(r4,r5); \
t5 = _mm512_unpackhi_ps(r4,r5); \
t6 = _mm512_unpacklo_ps(r6,r7); \
t7 = _mm512_unpackhi_ps(r6,r7); \
t8 = _mm512_unpacklo_ps(r8,r9); \
t9 = _mm512_unpackhi_ps(r8,r9); \
ta = _mm512_unpacklo_ps(ra,rb); \
tb = _mm512_unpackhi_ps(ra,rb); \
tc = _mm512_unpacklo_ps(rc,rd); \
td = _mm512_unpackhi_ps(rc,rd); \
te = _mm512_unpacklo_ps(re,rf); \
tf = _mm512_unpackhi_ps(re,rf); \
\
{ const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \
r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \
r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \
r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \
r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \
r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \
ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rb = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \
rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
{ const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \
re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \
\
t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \
t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \
t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \
t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \
t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \
t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \
t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \
t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \
t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \
t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \
ta = _mm512_shuffle_f32x4(ra, re, 0x88); \
tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \
tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \
td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \
te = _mm512_shuffle_f32x4(ra, re, 0xdd); \
tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \
\
r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \
r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \
r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \
r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \
r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \
r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \
r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \
r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \
r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \
r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \
ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \
rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \
rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \
rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \
re = _mm512_shuffle_f32x4(t6, te, 0xdd); \
rf = _mm512_shuffle_f32x4(t7, tf, 0xdd); \
\
_mm512_storeu_ps(ptr_B + 0*ldB, r0); \
_mm512_storeu_ps(ptr_B + 1*ldB, r1); \
_mm512_storeu_ps(ptr_B + 2*ldB, r2); \
_mm512_storeu_ps(ptr_B + 3*ldB, r3); \
_mm512_storeu_ps(ptr_B + 4*ldB, r4); \
_mm512_storeu_ps(ptr_B + 5*ldB, r5); \
_mm512_storeu_ps(ptr_B + 6*ldB, r6); \
_mm512_storeu_ps(ptr_B + 7*ldB, r7); \
_mm512_storeu_ps(ptr_B + 8*ldB, r8); \
_mm512_storeu_ps(ptr_B + 9*ldB, r9); \
_mm512_storeu_ps(ptr_B + 10*ldB, ra); \
_mm512_storeu_ps(ptr_B + 11*ldB, rb); \
_mm512_storeu_ps(ptr_B + 12*ldB, rc); \
_mm512_storeu_ps(ptr_B + 13*ldB, rd); \
_mm512_storeu_ps(ptr_B + 14*ldB, re); \
_mm512_storeu_ps(ptr_B + 15*ldB, rf); \
}
#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
__m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \
__m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \
__m512i vload_1 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A))); \
vload_1 = _mm512_inserti32x8(vload_1, _mm256_loadu_si256((const __m256i*)(ptr_A + ldA)), 1); \
EXPAND_BFLOAT16(vload_1, r0, r1);{ \
__m512i vload_2 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 2*ldA))); \
vload_2 = _mm512_inserti32x8(vload_2, _mm256_loadu_si256((const __m256i*)(ptr_A + 3*ldA)), 1); \
EXPAND_BFLOAT16(vload_2, r2, r3);{ \
__m512i vload_3 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 4*ldA))); \
vload_3 = _mm512_inserti32x8(vload_3, _mm256_loadu_si256((const __m256i*)(ptr_A + 5*ldA)), 1); \
EXPAND_BFLOAT16(vload_3, r4, r5);{ \
__m512i vload_4 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 6*ldA))); \
vload_4 = _mm512_inserti32x8(vload_4, _mm256_loadu_si256((const __m256i*)(ptr_A + 7*ldA)), 1); \
EXPAND_BFLOAT16(vload_4, r6, r7);{ \
__m512i vload_5 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 8*ldA))); \
vload_5 = _mm512_inserti32x8(vload_5, _mm256_loadu_si256((const __m256i*)(ptr_A + 9*ldA)), 1); \
EXPAND_BFLOAT16(vload_5, r8, r9);{ \
__m512i vload_6 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 10*ldA))); \
vload_6 = _mm512_inserti32x8(vload_6, _mm256_loadu_si256((const __m256i*)(ptr_A + 11*ldA)), 1); \
EXPAND_BFLOAT16(vload_6, ra, rb);{ \
__m512i vload_7 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 12*ldA))); \
vload_7 = _mm512_inserti32x8(vload_7, _mm256_loadu_si256((const __m256i*)(ptr_A + 13*ldA)), 1); \
EXPAND_BFLOAT16(vload_7, rc, rd);{ \
__m512i vload_8 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 14*ldA))); \
vload_8 = _mm512_inserti32x8(vload_8, _mm256_loadu_si256((const __m256i*)(ptr_A + 15*ldA)), 1); \
EXPAND_BFLOAT16(vload_8, re, rf); \
\
t0 = _mm512_unpacklo_ps(r0,r1); \
t1 = _mm512_unpackhi_ps(r0,r1); \
t2 = _mm512_unpacklo_ps(r2,r3); \
t3 = _mm512_unpackhi_ps(r2,r3); \
t4 = _mm512_unpacklo_ps(r4,r5); \
t5 = _mm512_unpackhi_ps(r4,r5); \
t6 = _mm512_unpacklo_ps(r6,r7); \
t7 = _mm512_unpackhi_ps(r6,r7); \
t8 = _mm512_unpacklo_ps(r8,r9); \
t9 = _mm512_unpackhi_ps(r8,r9); \
ta = _mm512_unpacklo_ps(ra,rb); \
tb = _mm512_unpackhi_ps(ra,rb); \
tc = _mm512_unpacklo_ps(rc,rd); \
td = _mm512_unpackhi_ps(rc,rd); \
te = _mm512_unpacklo_ps(re,rf); \
tf = _mm512_unpackhi_ps(re,rf); \
\
{ const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \
r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \
r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \
r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \
r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \
r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \
ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rb = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \
rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
{ const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \
re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \
rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \
\
t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \
t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \
t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \
t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \
t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \
t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \
t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \
t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \
t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \
t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \
ta = _mm512_shuffle_f32x4(ra, re, 0x88); \
tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \
tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \
td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \
te = _mm512_shuffle_f32x4(ra, re, 0xdd); \
tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \
\
r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \
r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \
r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \
r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \
r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \
r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \
r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \
r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \
r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \
r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \
ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \
rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \
rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \
rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \
re = _mm512_shuffle_f32x4(t6, te, 0xdd); \
rf = _mm512_shuffle_f32x4(t7, tf, 0xdd); \
\
_mm512_storeu_ps(ptr_B + 0*ldB, r0); \
_mm512_storeu_ps(ptr_B + 1*ldB, r1); \
_mm512_storeu_ps(ptr_B + 2*ldB, r2); \
_mm512_storeu_ps(ptr_B + 3*ldB, r3); \
_mm512_storeu_ps(ptr_B + 4*ldB, r4); \
_mm512_storeu_ps(ptr_B + 5*ldB, r5); \
_mm512_storeu_ps(ptr_B + 6*ldB, r6); \
_mm512_storeu_ps(ptr_B + 7*ldB, r7); \
_mm512_storeu_ps(ptr_B + 8*ldB, r8); \
_mm512_storeu_ps(ptr_B + 9*ldB, r9); \
_mm512_storeu_ps(ptr_B + 10*ldB, ra); \
_mm512_storeu_ps(ptr_B + 11*ldB, rb); \
_mm512_storeu_ps(ptr_B + 12*ldB, rc); \
_mm512_storeu_ps(ptr_B + 13*ldB, rd); \
_mm512_storeu_ps(ptr_B + 14*ldB, re); \
_mm512_storeu_ps(ptr_B + 15*ldB, rf);}}}}}}} \
}
#define COMPRESS_FP32(v, k, m, cnt) { \
_mm512_mask_compressstoreu_ps(values_ptr + (cnt), m, v); \
{ \
__m256i vk1 = _mm256_set1_epi16((short)(k)); \
__m256i vk2 = _mm256_set1_epi16((short)((k) + 8)); \
__m256i v_idx = _mm256_add_epi32(vk1, _mm256_loadu_si256(&shufmasks2[(m)&0xFF])); \
__m256i v_idx_2 = _mm256_add_epi32(vk2, _mm256_loadu_si256(&shufmasks2[((m)>>8)&0xFF])); \
_mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \
cnt = (unsigned short)((cnt) + _mm_popcnt_u32((m)&0xFF)); \
_mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx_2); \
cnt = (unsigned short)((cnt) + _mm_popcnt_u32(((m)>>8)&0xFF)); \
} \
}
#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
const __m512i vlo = _mm512_unpacklo_epi16(vzero, v); \
const __m512i vhi = _mm512_unpackhi_epi16(vzero, v); \
const __m512i permmask1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); \
const __m512i permmask2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); \
vlo_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask1, vhi)); \
vhi_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask2, vhi)); \
}
#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
const __m512i permmask1 = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0); \
const __m512i permmask2 = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2); \
const __m512i va = _mm512_castps_si512(vlo), vb = _mm512_castps_si512(vhi); \
const __m512i vtmp1 = _mm512_permutex2var_epi64(va, permmask1, vb); \
const __m512i vtmp2 = _mm512_permutex2var_epi64(va, permmask2, vb); \
const __m512i a = _mm512_srli_epi32(vtmp1, 16), b = _mm512_srli_epi32(vtmp2, 16); \
v = _mm512_packus_epi32(a, b); \
}
#endif
third_party/libxsmm/src/libxsmm_spmdm_end.h 0 → 100644
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#undef SIMD_WIDTH_FP32
#undef SIMDTYPE_FP32
#undef SIMDTYPE_INT32
#undef SIMDMASKTYPE_FP32
#undef _MM_SETZERO_FP32
#undef _MM_SETZERO_INT32
#undef _MM_SET1_FP32
#undef _MM_SET1_INT32
#undef _MM_SET1_INT16
#undef _MM_SET_INT32
#undef _MM_LOAD_FP32
#undef _MM_LOADU_FP32
#undef _MM_LOAD_INT32
#undef _MM_STORE_INT32
#undef _MM_LOADU_INT32
#undef _MM_GATHER_INT32
#undef _MM_GATHER_FP32
#undef _MM_CMPNEQ_FP32
#undef _MM_STORE_FP32
#undef _MM_STOREU_FP32
#undef _MM_ADD_FP32
#undef _MM_FMADD_FP32
#undef _MM_MUL_FP32
#undef _MM_PREFETCH
#undef TRANSPOSE_SIMD_WIDTH_KERNEL
#undef TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16
#undef COMPRESS_FP32
#undef EXPAND_BFLOAT16
#undef COMPRESS_BFLOAT16
#undef num_regs
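
libxsmm_spmdm_end.h removes everything a begin header defined, which is what lets one shared kernel body be compiled several times, once per instruction-set target. A hypothetical sketch of that include pattern (spmdm_kernel_body.h is a placeholder name, not a file from this commit; in practice the body is guarded by the same LIBXSMM_MAX_STATIC_TARGET_ARCH checks the begin headers use):

#include "libxsmm_spmdm_begin.h"        /* scalar macro surface */
#include "spmdm_kernel_body.h"          /* placeholder shared kernel body */
#include "libxsmm_spmdm_end.h"

#include "libxsmm_spmdm_begin_avx2.h"   /* AVX2 macro surface */
#include "spmdm_kernel_body.h"
#include "libxsmm_spmdm_end.h"

#include "libxsmm_spmdm_begin_avx512.h" /* AVX-512 macro surface */
#include "spmdm_kernel_body.h"
#include "libxsmm_spmdm_end.h"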