/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Nadathur Satish, Hans Pabst (Intel Corp.)
******************************************************************************/

/* Width and type configuration of the SIMD abstraction. Here the width is one
 * element, the FP32 and INT32 "vector" types are plain float and int, and the
 * FP32 mask type is an int. */
#define SIMD_WIDTH_FP32 (1)
#define SIMDTYPE_FP32 float
#define SIMDTYPE_INT32 int
#define SIMDMASKTYPE_FP32 int
/* Zero constructors: both expand to the integer constant 0. */
#define _MM_SETZERO_FP32() (0)
#define _MM_SETZERO_INT32() (0)
/* Broadcast ("set1") constructors: each expands to its parenthesized argument. */
#define _MM_SET1_FP32(x) (x)
#define _MM_SET1_INT32(x) (x)
/* The parameter list must follow the macro name without whitespace; otherwise
 * the preprocessor defines an object-like macro whose body is the tokens "(x)". */
#define _MM_SET1_INT16(x) (x)
/* Loads: dereference the pointer argument. The aligned (LOAD) and unaligned
 * (LOADU) variants expand identically. */
#define _MM_LOAD_FP32(x) (*(x))
#define _MM_LOADU_FP32(x) (*(x))
#define _MM_LOAD_INT32(x) (*(x))
/* Store y through pointer x; the expansion is an assignment expression. */
#define _MM_STORE_INT32(x,y) ((*(x)) = (y))
#define _MM_LOADU_INT32(x) (*(x))
/* Gather of a single element: reads Addr[idx], where idx is applied via pointer
 * arithmetic (i.e. in units of the pointed-to type). The scale argument does
 * not appear in the expansion and is ignored. Addr is parenthesized so that
 * low-precedence expressions (e.g. "c ? a : b") are usable as base address. */
#define _MM_GATHER_FP32(Addr, idx, scale) (*((Addr) + (idx)))
/* Yields 0 if LIBXSMM_FEQ(v1, v2) is non-zero and 1 otherwise; LIBXSMM_FEQ is
 * not defined in this file. */
#define _MM_CMPNEQ_FP32(v1,v2) (LIBXSMM_FEQ(v1, v2) ? 0 : 1)
/* Stores: aligned (STORE) and unaligned (STOREU) variants expand identically. */
#define _MM_STORE_FP32(x,y) ((*(x)) = (y))
#define _MM_STOREU_FP32(x,y) ((*(x)) = (y))
/* Element-wise arithmetic on single values. The multiply-add is spelled as a
 * product followed by a sum: a * b + c. */
#define _MM_MUL_FP32(a, b) ((a) * (b))
#define _MM_ADD_FP32(a, b) ((a) + (b))
#define _MM_FMADD_FP32(a, b, c) (((a) * (b)) + (c))
/* Expands to nothing; both arguments are discarded. */
#define _MM_PREFETCH(x, y)
/* Copies the single element at src to dst. The two leading-dimension
 * arguments do not appear in the expansion. */
#define TRANSPOSE_SIMD_WIDTH_KERNEL(src, ld_src, dst, ld_dst) (*(dst) = *(src))
/* Single-element bfloat16 variant of the transpose kernel: reads one 16-bit
 * value from ptr_A, places it into the upper 16 bits of a 32-bit word (lower
 * 16 bits zero), and stores the float with that bit pattern to *ptr_B. The
 * ldA and ldB arguments do not appear in the expansion.
 * The shift is carried out on an unsigned value: inputs with bit 15 set (i.e.
 * negative numbers) would otherwise be shifted into the sign bit of a signed
 * int, which is undefined behavior. Like the float/int punning itself, this
 * relies on int/unsigned int being 32 bits wide.
 * Expands to a brace-enclosed block (a statement, not an expression). */
#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
  const uint16_t bf16_bits = (*(ptr_A)); \
  union { unsigned int u; float f; } widened; \
  widened.u = ((unsigned int)bf16_bits) << 16; \
  (*(ptr_B)) = widened.f; \
}

/* Conditional append of one (value, column index) pair: if m is non-zero,
 * stores v to values_ptr[cnt] and k (converted to uint16_t) to colidx_ptr[cnt],
 * then increments cnt. values_ptr and colidx_ptr are not parameters; they must
 * be visible at the point of expansion. cnt must be a modifiable lvalue and is
 * parenthesized so that expressions such as "*counter" are incremented as a
 * whole (rather than expanding to "*counter++").
 * Expands to a bare "if (...) { ... }" statement: do not follow a use of this
 * macro with "else". */
#define COMPRESS_FP32(v, k, m, cnt) if (m) { \
  values_ptr[(cnt)] = (v); \
  colidx_ptr[(cnt)] = (uint16_t)(k); \
  (cnt)++; \
}

/* Unpacks a 32-bit word v holding two 16-bit halves into two floats:
 *   vlo_final receives the float whose upper 16 bits are the LOW half of v,
 *   vhi_final receives the float whose upper 16 bits are the HIGH half of v;
 * the lower 16 bits of both results are zero. This is the inverse of the
 * packing performed by COMPRESS_BFLOAT16.
 * The high half is selected with mask 0xFFFF0000 (selecting the low half a
 * second time would yield a meaningless value), and all bit manipulation is
 * done on unsigned values to avoid shifting into the sign bit of a signed int.
 * Relies on unsigned int being 32 bits wide. v is evaluated twice.
 * Expands to a brace-enclosed block (a statement, not an expression). */
#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
  union { unsigned int u; float f; } vlo_tmp, vhi_tmp; \
  vlo_tmp.u = (((unsigned int)(v)) & 0xFFFFu) << 16; \
  (vlo_final) = vlo_tmp.f; \
  vhi_tmp.u = ((unsigned int)(v)) & 0xFFFF0000u; \
  (vhi_final) = vhi_tmp.f; \
}

/* Packs two floats into one 32-bit word v by truncation (no rounding):
 *   the upper 16 bits of vlo become the LOW 16 bits of v,
 *   the upper 16 bits of vhi become the HIGH 16 bits of v.
 * The bit patterns are handled as unsigned values: right-shifting the signed
 * pattern of a negative vlo would sign-extend into the upper 16 bits and
 * thereby clobber the half that belongs to vhi.
 * Relies on unsigned int being 32 bits wide. v must be a modifiable lvalue; it
 * is assigned twice (first the low half, then the combined word).
 * Expands to a brace-enclosed block (a statement, not an expression). */
#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
  union { unsigned int u; float f; } vlo_tmp, vhi_tmp; \
  vlo_tmp.f = (vlo); \
  (v) = (vlo_tmp.u >> 16); \
  vhi_tmp.f = (vhi); \
  (v) = (v) | (vhi_tmp.u & 0xFFFF0000u); \
}