Commit c454d419 authored by lisj

Remove the submodule's .gitignore

parent 3359c1f1
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_FUSEDGROUPNORM_H
#define LIBXSMM_DNN_FUSEDGROUPNORM_H
#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"
/** Opaque handle which represents a LIBXSMM fusedgroupnorm */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm libxsmm_dnn_fusedgroupnorm;
LIBXSMM_API libxsmm_dnn_fusedgroupnorm* libxsmm_dnn_create_fusedgroupnorm(libxsmm_dnn_fusedgroupnorm_desc fusedgroupnorm_desc, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedgroupnorm(const libxsmm_dnn_fusedgroupnorm* handle);
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(const libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API size_t libxsmm_dnn_fusedgroupnorm_get_scratch_size(const libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_scratch(libxsmm_dnn_fusedgroupnorm* handle, const void* scratch);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_scratch(libxsmm_dnn_fusedgroupnorm* handle);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedgroupnorm_get_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_execute_st(libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_compute_kind kind,
/*unsigned*/int start_thread, /*unsigned*/int tid);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind,
/*unsigned*/int start_thread, /*unsigned*/int tid);
#endif /*LIBXSMM_DNN_FUSEDGROUPNORM_H*/
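/* A minimal lifecycle sketch for the handle-based API above (an editor-added
   illustration, not from the original source): descriptor setup is elided
   because libxsmm_dnn_fusedgroupnorm_desc is not shown in this excerpt, and
   error handling is reduced to a single assertion. */
#include <libxsmm.h>
#include <stdlib.h>
#include <assert.h>
void fusedgroupnorm_example(libxsmm_dnn_fusedgroupnorm_desc desc) {
  libxsmm_dnn_err_t status;
  /* create the opaque handle from a pre-filled descriptor */
  libxsmm_dnn_fusedgroupnorm* handle = libxsmm_dnn_create_fusedgroupnorm(desc, &status);
  assert(LIBXSMM_DNN_SUCCESS == status && NULL != handle);
  /* query and bind scratch memory before executing */
  void* scratch = malloc(libxsmm_dnn_fusedgroupnorm_get_scratch_size(handle, &status));
  libxsmm_dnn_fusedgroupnorm_bind_scratch(handle, scratch);
  /* ... bind input/output tensors via libxsmm_dnn_fusedgroupnorm_bind_tensor ... */
  /* single-threaded forward pass: thread 0, starting at thread 0 */
  libxsmm_dnn_fusedgroupnorm_execute_st(handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0/*start_thread*/, 0/*tid*/);
  libxsmm_dnn_fusedgroupnorm_release_scratch(handle);
  free(scratch);
  libxsmm_dnn_destroy_fusedgroupnorm(handle);
}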
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_SGD_H
#define LIBXSMM_DNN_SGD_H
#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"
/** Opaque handle which represents a LIBXSMM optimizer */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer libxsmm_dnn_optimizer;
typedef enum libxsmm_dnn_optimizer_type {
LIBXSMM_DNN_OPTIMIZER_SGD = 1
} libxsmm_dnn_optimizer_type;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer_desc {
int C; /* number of input feature maps */
int K; /* number of output feature maps */
int bc; /* blocking factor for C */
int bk; /* blocking factor for K */
float learning_rate; /* learning rate */
int threads; /* number of threads used */
libxsmm_dnn_optimizer_type opt_type;
libxsmm_dnn_datatype datatype_master; /* datatype used for the master copy of the weights */
libxsmm_dnn_datatype datatype; /* datatype used for all input related buffers */
libxsmm_dnn_tensor_format filter_format; /* format which is used for filter buffers */
} libxsmm_dnn_optimizer_desc;
LIBXSMM_API libxsmm_dnn_optimizer* libxsmm_dnn_create_optimizer(libxsmm_dnn_optimizer_desc optimizer_desc, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_optimizer(const libxsmm_dnn_optimizer* handle);
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_optimizer_create_tensor_datalayout(const libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API void* libxsmm_dnn_optimizer_get_scratch_ptr (const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API size_t libxsmm_dnn_optimizer_get_scratch_size(const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_scratch(libxsmm_dnn_optimizer* handle, const void* scratch);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_scratch(libxsmm_dnn_optimizer* handle);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_optimizer_get_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_execute_st(libxsmm_dnn_optimizer* handle, /*unsigned*/int start_thread, /*unsigned*/int tid);
#endif /*LIBXSMM_DNN_SGD_H*/
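/* A minimal sketch (editor-added) of filling the descriptor declared above
   and taking one SGD step; the sizes, blockings, and the enum values
   LIBXSMM_DNN_DATATYPE_F32 and LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM are
   illustrative assumptions, not mandates of this header. */
#include <libxsmm.h>
void sgd_step_example(void) {
  libxsmm_dnn_err_t status;
  libxsmm_dnn_optimizer_desc desc;
  desc.C = 64; desc.K = 64;   /* feature-map counts */
  desc.bc = 16; desc.bk = 16; /* blocking factors */
  desc.learning_rate = 0.1f;
  desc.threads = 1;
  desc.opt_type = LIBXSMM_DNN_OPTIMIZER_SGD;
  desc.datatype_master = LIBXSMM_DNN_DATATYPE_F32;
  desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
  desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
  libxsmm_dnn_optimizer* handle = libxsmm_dnn_create_optimizer(desc, &status);
  /* ... bind the filter, master-filter, and gradient tensors, then: */
  libxsmm_dnn_optimizer_execute_st(handle, 0/*start_thread*/, 0/*tid*/);
  libxsmm_dnn_destroy_optimizer(handle);
}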
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_POOLING_H
#define LIBXSMM_DNN_POOLING_H
#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"
/** Opaque handle which represents a LIBXSMM pooling operation */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling libxsmm_dnn_pooling;
typedef enum libxsmm_dnn_pooling_type {
LIBXSMM_DNN_POOLING_MAX = 1,
LIBXSMM_DNN_POOLING_AVG = 2
} libxsmm_dnn_pooling_type;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling_desc {
int N; /* number of images in mini-batch */
int C; /* number of input feature maps */
int H; /* height of input image */
int W; /* width of input image */
int R; /* kernel height */
int S; /* kernel width */
int u; /* vertical stride */
int v; /* horizontal stride */
int pad_h; /* height of logical padding of input buffer */
int pad_w; /* width of logical padding of input buffer */
int pad_h_in; /* height of physical zero-padding in input buffer */
int pad_w_in; /* width of physical zero-padding in input buffer */
int pad_h_out; /* height of physical zero-padding in output buffer */
int pad_w_out; /* width of physical zero-padding in output buffer */
int threads; /* number of threads used */
libxsmm_dnn_datatype datatype_in; /* datatype used for all input related buffers */
libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */
libxsmm_dnn_datatype datatype_mask; /* datatype used for the masks */
libxsmm_dnn_tensor_format buffer_format; /* format which is used for activation buffers */
libxsmm_dnn_pooling_type pooling_type; /* type of pooling operation */
} libxsmm_dnn_pooling_desc;
LIBXSMM_API libxsmm_dnn_pooling* libxsmm_dnn_create_pooling(libxsmm_dnn_pooling_desc pooling_desc, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_pooling(const libxsmm_dnn_pooling* handle);
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_pooling_create_tensor_datalayout(const libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API size_t libxsmm_dnn_pooling_get_scratch_size(const libxsmm_dnn_pooling* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_scratch(libxsmm_dnn_pooling* handle, const void* scratch);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_scratch(libxsmm_dnn_pooling* handle);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_pooling_get_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_execute_st(libxsmm_dnn_pooling* handle, libxsmm_dnn_compute_kind kind,
/*unsigned*/int start_thread, /*unsigned*/int tid);
#endif /*LIBXSMM_DNN_POOLING_H*/
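/* A sketch (editor-added) of a 2x2 max-pooling descriptor with stride 2 and
   no padding, exercising the fields declared above; all concrete values and
   the F32/I32/LIBXSMM enum choices are illustrative assumptions. */
#include <libxsmm.h>
void pooling_desc_example(void) {
  libxsmm_dnn_err_t status;
  libxsmm_dnn_pooling_desc desc;
  desc.N = 32; desc.C = 64;                 /* mini-batch and channels */
  desc.H = 56; desc.W = 56;                 /* input spatial extents */
  desc.R = 2;  desc.S = 2;                  /* 2x2 pooling window */
  desc.u = 2;  desc.v = 2;                  /* stride 2 in both directions */
  desc.pad_h = 0;     desc.pad_w = 0;       /* logical padding */
  desc.pad_h_in = 0;  desc.pad_w_in = 0;    /* physical input padding */
  desc.pad_h_out = 0; desc.pad_w_out = 0;   /* physical output padding */
  desc.threads = 1;
  desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32;
  desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
  desc.datatype_mask = LIBXSMM_DNN_DATATYPE_I32;
  desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
  desc.pooling_type = LIBXSMM_DNN_POOLING_MAX;
  libxsmm_dnn_pooling* handle = libxsmm_dnn_create_pooling(desc, &status);
  /* ... bind tensors and scratch, execute, and release as with other handles ... */
  libxsmm_dnn_destroy_pooling(handle);
}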
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Kunal Banerjee (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_RNNCELL_H
#define LIBXSMM_DNN_RNNCELL_H
#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell libxsmm_dnn_rnncell;
/** Type of RNN cell. */
typedef enum libxsmm_dnn_rnncell_type {
/** simple RNN cell with ReLU as activation function */
LIBXSMM_DNN_RNNCELL_RNN_RELU,
/** simple RNN cell with sigmoid as activation function */
LIBXSMM_DNN_RNNCELL_RNN_SIGMOID,
/** simple RNN cell with tanh as activation function */
LIBXSMM_DNN_RNNCELL_RNN_TANH,
/** LSTM cell */
LIBXSMM_DNN_RNNCELL_LSTM,
/** GRU cell */
LIBXSMM_DNN_RNNCELL_GRU
} libxsmm_dnn_rnncell_type;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell_desc {
int threads;
libxsmm_blasint K; /* number of outputs */
libxsmm_blasint N; /* size of the minibatch */
libxsmm_blasint C; /* number of inputs */
libxsmm_blasint max_T; /* number of time steps */
libxsmm_blasint bk;
libxsmm_blasint bn;
libxsmm_blasint bc;
int use_fwd_fused_impl;
int fwd_block;
int bwdupd_block;
libxsmm_dnn_rnncell_type cell_type; /* cell type RNN ReLU, RNN Sigmoid, RNN Tanh, LSTM, GRU */
libxsmm_dnn_datatype datatype_in; /* datatype used for all input related buffers */
libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */
libxsmm_dnn_tensor_format buffer_format; /* format which is used for activation buffers */
libxsmm_dnn_tensor_format filter_format; /* format which is used for filter buffers */
} libxsmm_dnn_rnncell_desc;
LIBXSMM_API libxsmm_dnn_rnncell* libxsmm_dnn_create_rnncell(libxsmm_dnn_rnncell_desc rnncell_desc, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_rnncell(const libxsmm_dnn_rnncell* handle);
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_rnncell_create_tensor_datalayout(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API size_t libxsmm_dnn_rnncell_get_scratch_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status);
LIBXSMM_API void* libxsmm_dnn_rnncell_get_scratch_ptr (const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* scratch);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind);
LIBXSMM_API size_t libxsmm_dnn_rnncell_get_internalstate_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status);
LIBXSMM_API void* libxsmm_dnn_rnncell_get_internalstate_ptr (const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* internalstate);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_dnn_rnncell* handle, const float forget_bias);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_rnncell_get_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_set_sequence_length( libxsmm_dnn_rnncell* handle, const libxsmm_blasint T );
LIBXSMM_API libxsmm_blasint libxsmm_dnn_rnncell_get_sequence_length( libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status );
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_execute_st(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind,
/*unsigned*/int start_thread, /*unsigned*/int tid);
#endif /*LIBXSMM_DNN_RNNCELL_H*/
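/* A sketch (editor-added) of two RNN-cell specifics in the API above:
   scratch is requested and bound per compute kind, and the active sequence
   length can be changed at runtime (presumably up to max_T); descriptor
   setup and tensor binding are elided. */
#include <libxsmm.h>
#include <stdlib.h>
void rnncell_example(libxsmm_dnn_rnncell_desc desc) {
  libxsmm_dnn_err_t status;
  libxsmm_dnn_rnncell* handle = libxsmm_dnn_create_rnncell(desc, &status);
  /* scratch size depends on the compute kind */
  void* scratch = malloc(libxsmm_dnn_rnncell_get_scratch_size(handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status));
  libxsmm_dnn_rnncell_bind_scratch(handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch);
  /* shorten the active sequence without re-creating the handle */
  libxsmm_dnn_rnncell_set_sequence_length(handle, 10);
  /* ... bind weight/state tensors, then run a forward step: */
  libxsmm_dnn_rnncell_execute_st(handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0/*start_thread*/, 0/*tid*/);
  libxsmm_dnn_rnncell_release_scratch(handle, LIBXSMM_DNN_COMPUTE_KIND_ALL);
  free(scratch);
  libxsmm_dnn_destroy_rnncell(handle);
}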
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_SOFTMAXLOSS_H
#define LIBXSMM_DNN_SOFTMAXLOSS_H
#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"
/** Opaque handle which represents a LIBXSMM softmaxloss */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss libxsmm_dnn_softmaxloss;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss_desc {
int N; /* number of images in mini-batch */
int C; /* number of input feature maps */
int bn; /* requested N blocking for NCNC format */
int bc; /* requested C blocking for NCNC format */
float loss_weight; /* loss weight */
int threads; /* number of threads used */
libxsmm_dnn_datatype datatype; /* datatype used for all buffers */
libxsmm_dnn_tensor_format buffer_format; /* format which is used for activation buffers */
} libxsmm_dnn_softmaxloss_desc;
LIBXSMM_API libxsmm_dnn_softmaxloss* libxsmm_dnn_create_softmaxloss(libxsmm_dnn_softmaxloss_desc softmaxloss_desc, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_softmaxloss(const libxsmm_dnn_softmaxloss* handle);
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_softmaxloss_create_tensor_datalayout(const libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API void* libxsmm_dnn_softmaxloss_get_scratch_ptr (const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API size_t libxsmm_dnn_softmaxloss_get_scratch_size(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_scratch(libxsmm_dnn_softmaxloss* handle, const void* scratch);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_scratch(libxsmm_dnn_softmaxloss* handle);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_softmaxloss_get_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_execute_st(libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_compute_kind kind,
/*unsigned*/int start_thread, /*unsigned*/int tid);
LIBXSMM_API float libxsmm_dnn_softmaxloss_get_loss(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status);
#endif /*LIBXSMM_DNN_SOFTMAXLOSS_H*/
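/* A short sketch (editor-added) of the softmaxloss-specific tail of the API
   above: after a forward execution, the scalar loss is read back from the
   handle; tensor and scratch binding are assumed to have happened already. */
#include <libxsmm.h>
float softmaxloss_forward(libxsmm_dnn_softmaxloss* handle) {
  libxsmm_dnn_err_t status;
  libxsmm_dnn_softmaxloss_execute_st(handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0/*start_thread*/, 0/*tid*/);
  return libxsmm_dnn_softmaxloss_get_loss(handle, &status);
}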
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_TENSOR_H
#define LIBXSMM_DNN_TENSOR_H
#include "libxsmm_typedefs.h"
#include "libxsmm_dnn.h"
/** Opaque handle which represents a LIBXSMM tensor */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor libxsmm_dnn_tensor;
typedef enum libxsmm_dnn_tensor_dimtype {
/** Mini-batch */
LIBXSMM_DNN_TENSOR_DIMTYPE_N,
/** Image Height */
LIBXSMM_DNN_TENSOR_DIMTYPE_H,
/** Image Width */
LIBXSMM_DNN_TENSOR_DIMTYPE_W,
/** channels or input channels */
LIBXSMM_DNN_TENSOR_DIMTYPE_C,
/** output channels */
LIBXSMM_DNN_TENSOR_DIMTYPE_K,
/** kernel height */
LIBXSMM_DNN_TENSOR_DIMTYPE_R,
/** kernel width */
LIBXSMM_DNN_TENSOR_DIMTYPE_S,
/** sequence length counter */
LIBXSMM_DNN_TENSOR_DIMTYPE_T,
/** channel group counter */
LIBXSMM_DNN_TENSOR_DIMTYPE_G,
/** general counter */
LIBXSMM_DNN_TENSOR_DIMTYPE_X
} libxsmm_dnn_tensor_dimtype;
/** types of different buffers */
typedef enum libxsmm_dnn_tensor_type {
/** regular input buffer */
LIBXSMM_DNN_REGULAR_INPUT,
/** regular input buffer for elementwise addition */
LIBXSMM_DNN_REGULAR_INPUT_ADD,
/** regular input buffer, transpose */
LIBXSMM_DNN_REGULAR_INPUT_TRANS,
/** gradient input buffer */
LIBXSMM_DNN_GRADIENT_INPUT,
/** gradient input buffer for elementwise addition */
LIBXSMM_DNN_GRADIENT_INPUT_ADD,
/** regular output buffer */
LIBXSMM_DNN_REGULAR_OUTPUT,
/** gradient output buffer */
LIBXSMM_DNN_GRADIENT_OUTPUT,
/** general input type */
LIBXSMM_DNN_INPUT,
/** general output type */
LIBXSMM_DNN_OUTPUT,
/** general activation type */
LIBXSMM_DNN_ACTIVATION,
/* regular filter */
LIBXSMM_DNN_REGULAR_FILTER,
/* regular filter, transpose */
LIBXSMM_DNN_REGULAR_FILTER_TRANS,
/* gradient filter */
LIBXSMM_DNN_GRADIENT_FILTER,
/* master filter */
LIBXSMM_DNN_MASTER_FILTER,
/** general filter type */
LIBXSMM_DNN_FILTER,
/* regular bias */
LIBXSMM_DNN_REGULAR_CHANNEL_BIAS,
/* gradient bias */
LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS,
/* bias */
LIBXSMM_DNN_CHANNEL_BIAS,
/* regular beta */
LIBXSMM_DNN_REGULAR_CHANNEL_BETA,
/* gradient beta */
LIBXSMM_DNN_GRADIENT_CHANNEL_BETA,
/* beta */
LIBXSMM_DNN_CHANNEL_BETA,
/* regular gamma */
LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA,
/* gradient gamma */
LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA,
/* gamma */
LIBXSMM_DNN_CHANNEL_GAMMA,
/* expected value */
LIBXSMM_DNN_CHANNEL_EXPECTVAL,
/* reciprocal standard deviation */
LIBXSMM_DNN_CHANNEL_RCPSTDDEV,
/* variance */
LIBXSMM_DNN_CHANNEL_VARIANCE,
/** general bias type */
LIBXSMM_DNN_CHANNEL_SCALAR,
/** Labels */
LIBXSMM_DNN_LABEL,
/** batch stats */
LIBXSMM_DNN_BATCH_STATS,
LIBXSMM_DNN_MAX_STATS_FWD,
LIBXSMM_DNN_MAX_STATS_BWD,
LIBXSMM_DNN_MAX_STATS_UPD,
/** pooling mask */
LIBXSMM_DNN_POOLING_MASK,
/** ReLU mask */
LIBXSMM_DNN_RELU_MASK,
/** general type; if used, it might cause API issues in the copy-in/out API */
LIBXSMM_DNN_TENSOR,
/** regular input buffer */
LIBXSMM_DNN_RNN_REGULAR_INPUT,
/** regular previous cell state buffer */
LIBXSMM_DNN_RNN_REGULAR_CS_PREV,
/** regular previous hidden state buffer */
LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV,
/** regular weight (LSTM: wi, wc, wf, wo) */
LIBXSMM_DNN_RNN_REGULAR_WEIGHT,
/** regular recurrent weight (LSTM: ri, rc, rf, ro) */
LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT,
/** regular weight, transpose (LSTM: wi, wc, wf, wo) */
LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS,
/** regular recurrent weight, transpose (LSTM: ri, rc, rf, ro) */
LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS,
/** regular bias (LSTM: bi, bc, bf, bo) */
LIBXSMM_DNN_RNN_REGULAR_BIAS,
/** regular output cell state buffer */
LIBXSMM_DNN_RNN_REGULAR_CS,
/** regular hidden state buffer */
LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE,
/** gradient input buffer */
LIBXSMM_DNN_RNN_GRADIENT_INPUT,
/** gradient previous cell state buffer */
LIBXSMM_DNN_RNN_GRADIENT_CS_PREV,
/** gradient previous hidden state buffer */
LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV,
/** gradient weight */
LIBXSMM_DNN_RNN_GRADIENT_WEIGHT,
/** gradient recurrent weight */
LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT,
/** gradient bias */
LIBXSMM_DNN_RNN_GRADIENT_BIAS,
/** gradient output cell state buffer */
LIBXSMM_DNN_RNN_GRADIENT_CS,
/** gradient hidden state buffer */
LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE,
/** internal i buffer */
LIBXSMM_DNN_RNN_INTERNAL_I,
/** internal f buffer */
LIBXSMM_DNN_RNN_INTERNAL_F,
/** internal o buffer */
LIBXSMM_DNN_RNN_INTERNAL_O,
/** internal ci buffer */
LIBXSMM_DNN_RNN_INTERNAL_CI,
/** internal co buffer */
LIBXSMM_DNN_RNN_INTERNAL_CO
} libxsmm_dnn_tensor_type;
/** layout descriptor to allow external data handling
outside of LIBXSMM */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor_datalayout {
libxsmm_dnn_tensor_dimtype* dim_type;
unsigned int* dim_size;
unsigned int num_dims;
libxsmm_dnn_tensor_format format; /* format of activation buffer */
libxsmm_dnn_datatype datatype; /* data type */
libxsmm_dnn_tensor_type tensor_type; /* tensor type */
} libxsmm_dnn_tensor_datalayout;
/** tensorlayout handling */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_duplicate_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor_datalayout(libxsmm_dnn_tensor_datalayout* layout);
LIBXSMM_API unsigned int libxsmm_dnn_compare_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout_a, const libxsmm_dnn_tensor_datalayout* layout_b, libxsmm_dnn_err_t* status);
LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_size(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status);
LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_elements(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status);
/** Create and manage buffers, filters and bias (non-NULL if successful) */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_tensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_qtensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, const unsigned char exp, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_tensor* tensor, const void* data);
LIBXSMM_API void* libxsmm_dnn_get_tensor_data_ptr(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_get_tensor_datalayout(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status);
LIBXSMM_API unsigned char libxsmm_dnn_get_qtensor_scf(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_qtensor_scf(libxsmm_dnn_tensor* tensor, const unsigned char scf);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor(const libxsmm_dnn_tensor* tensor);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_zero_tensor(const libxsmm_dnn_tensor* tensor);
/**
* Copy-in/out from a plain format such as [N][C][H][W] or [N][H][W][C]
*/
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyin_tensor(const libxsmm_dnn_tensor* tensor, const void* data, const libxsmm_dnn_tensor_format in_format);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyout_tensor(const libxsmm_dnn_tensor* tensor, void* data, const libxsmm_dnn_tensor_format out_format);
#endif /*LIBXSMM_DNN_TENSOR_H*/
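/* A sketch (editor-added) tying the tensor API above to a layer handle:
   query the layout the library expects, link user-owned memory to it, and
   reorder data in from plain NCHW; the pooling handle and both buffers are
   assumptions for illustration. */
#include <libxsmm.h>
void bind_pooling_input(libxsmm_dnn_pooling* handle, const float* nchw_data, void* lib_data) {
  libxsmm_dnn_err_t status;
  libxsmm_dnn_tensor_datalayout* layout =
    libxsmm_dnn_pooling_create_tensor_datalayout(handle, LIBXSMM_DNN_REGULAR_INPUT, &status);
  libxsmm_dnn_tensor* tensor = libxsmm_dnn_link_tensor(layout, lib_data, &status);
  /* reorder from plain [N][C][H][W] into the library-internal layout */
  libxsmm_dnn_copyin_tensor(tensor, nchw_data, LIBXSMM_DNN_TENSOR_FORMAT_NCHW);
  libxsmm_dnn_pooling_bind_tensor(handle, tensor, LIBXSMM_DNN_REGULAR_INPUT);
  libxsmm_dnn_destroy_tensor_datalayout(layout);
}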
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_FRONTEND_H
#define LIBXSMM_FRONTEND_H
#include "libxsmm_typedefs.h"
/** Helper macros for eliding prefetch address calculations depending on prefetch scheme. */
#if !defined(_WIN32) && !defined(__CYGWIN__) /* TODO: fully support calling convention */
#if 0 != ((LIBXSMM_PREFETCH) & 2/*AL2*/) \
|| 0 != ((LIBXSMM_PREFETCH) & 8/*AL2_AHEAD*/)
# define LIBXSMM_GEMM_PREFETCH_A(EXPR) (EXPR)
#endif
#if 0 != ((LIBXSMM_PREFETCH) & 4/*BL2_VIA_C*/) \
|| 0 != ((LIBXSMM_PREFETCH) & 16/*BL1*/)
# define LIBXSMM_GEMM_PREFETCH_B(EXPR) (EXPR)
#endif
#endif
/** Secondary helper macros derived from the above group. */
#if defined(LIBXSMM_GEMM_PREFETCH_A)
# define LIBXSMM_NOPREFETCH_A(EXPR)
#else
# define LIBXSMM_NOPREFETCH_A(EXPR) EXPR
# define LIBXSMM_GEMM_PREFETCH_A(EXPR) 0
#endif
#if defined(LIBXSMM_GEMM_PREFETCH_B)
# define LIBXSMM_NOPREFETCH_B(EXPR)
#else
# define LIBXSMM_NOPREFETCH_B(EXPR) EXPR
# define LIBXSMM_GEMM_PREFETCH_B(EXPR) 0
#endif
#if defined(LIBXSMM_GEMM_PREFETCH_C)
# define LIBXSMM_NOPREFETCH_C(EXPR)
#else
# define LIBXSMM_NOPREFETCH_C(EXPR) EXPR
# define LIBXSMM_GEMM_PREFETCH_C(EXPR) 0
#endif
/** MKL_DIRECT_CALL requires including the MKL interface. */
#if (defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) || \
(defined(__MKL) && !defined(LIBXSMM_BUILD) && \
(!defined(__BLAS) || (0 != __BLAS))))
# if (0 != LIBXSMM_ILP64 && !defined(MKL_ILP64))
# error "Inconsistent ILP64 configuration detected!"
# endif
# if defined(LIBXSMM_OFFLOAD_BUILD)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
# include <mkl.h>
# pragma offload_attribute(pop)
# else
# include <mkl.h>
# endif
#endif
/** __INTEL_MKL__ is needed later to fix a NOTHROW issue. */
#if defined(__MKL) && !defined(__INTEL_MKL__) && defined(NOTHROW)
# if defined(LIBXSMM_OFFLOAD_BUILD)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
# include <mkl_version.h>
# pragma offload_attribute(pop)
# else
# include <mkl_version.h>
# endif
#endif
/** Unfortunately, the calculation of INTEL_MKL_VERSION is not stable over time. */
#if defined(__INTEL_MKL__) && defined(__INTEL_MKL_MINOR__) && defined(__INTEL_MKL_UPDATE__)
# define LIBXSMM_MKL_VERSION3 LIBXSMM_VERSION3(__INTEL_MKL__, __INTEL_MKL_MINOR__, __INTEL_MKL_UPDATE__)
#endif
/** Automatically select a prefetch-strategy (libxsmm_get_gemm_xprefetch, etc.). */
#define LIBXSMM_PREFETCH_AUTO -1
/** Append "_omp" postfix to the given symbol. */
#define LIBXSMM_USEOMP(FUNCTION) LIBXSMM_CONCATENATE(FUNCTION, _omp)
/** Helper macro for BLAS-style prefixes. */
#define LIBXSMM_TPREFIX_NAME(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_, TYPE)
#define LIBXSMM_TPREFIX(TYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_NAME(TYPE), FUNCTION)
#define LIBXSMM_TPREFIX_doubledouble d
#define LIBXSMM_TPREFIX_floatfloat s
#define LIBXSMM_TPREFIX_shortfloat ws
#define LIBXSMM_TPREFIX_shortint wi
#define LIBXSMM_TPREFIX_libxsmm_bfloat16float bs
/** Defaults if only the input type is specified. */
#define LIBXSMM_TPREFIX_double LIBXSMM_TPREFIX_doubledouble
#define LIBXSMM_TPREFIX_float LIBXSMM_TPREFIX_floatfloat
#define LIBXSMM_TPREFIX_short LIBXSMM_TPREFIX_shortint
#define LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE) LIBXSMM_CONCATENATE(LIBXSMM_GEMM_XFLAGS_, ITYPE) /* ignore OTYPE for now */
#define LIBXSMM_GEMM_XFLAGS_double 0
#define LIBXSMM_GEMM_XFLAGS_float 0
#define LIBXSMM_GEMM_XFLAGS_libxsmm_bfloat16 LIBXSMM_GEMM_FLAG_VNNI_A
#define LIBXSMM_GEMM_XFLAGS_int 0
#define LIBXSMM_GEMM_XFLAGS_short 0
/** Construct symbol name from a given real type name (float, double and short). */
#define LIBXSMM_BLAS_FNTYPE(TYPE, KIND) LIBXSMM_CONCATENATE3(libxsmm_, LIBXSMM_TPREFIX(TYPE, KIND), _function)
#define LIBXSMM_MMFUNCTION_TYPE(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmfunction))
#define LIBXSMM_MMDISPATCH_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmdispatch))
#define LIBXSMM_XBLAS_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_blas_, LIBXSMM_TPREFIX(TYPE, gemm))
#define LIBXSMM_XGEMM_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, gemm))
#define LIBXSMM_YGEMM_SYMBOL(TYPE) LIBXSMM_USEOMP(LIBXSMM_XGEMM_SYMBOL(TYPE))
#define LIBXSMM_BLAS_SYMBOL(TYPE, KIND) LIBXSMM_FSYMBOL(LIBXSMM_TPREFIX(TYPE, KIND))
#define LIBXSMM_CBLAS_SYMBOL LIBXSMM_TPREFIX
#define LIBXSMM_BLAS_DECL(TYPE, KIND, DECL) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_, LIBXSMM_TPREFIX(TYPE, KIND))(DECL)
#if !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL)
# define LIBXSMM_BLAS_dgemm(DECL) DECL;
# define LIBXSMM_BLAS_sgemm(DECL) DECL;
# define LIBXSMM_BLAS_dgemv(DECL) DECL;
# define LIBXSMM_BLAS_sgemv(DECL) DECL;
#else
# define LIBXSMM_BLAS_dgemm
# define LIBXSMM_BLAS_sgemm
# define LIBXSMM_BLAS_dgemv
# define LIBXSMM_BLAS_sgemv
#endif
/* Construct prefix names, function type or dispatch function from given input and output types. */
#define LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) LIBXSMM_MMFUNCTION_TYPE(LIBXSMM_CONCATENATE(ITYPE, OTYPE))
#define LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE) LIBXSMM_MMDISPATCH_SYMBOL(LIBXSMM_CONCATENATE(ITYPE, OTYPE))
#define LIBXSMM_TPREFIX_NAME2(ITYPE, OTYPE) LIBXSMM_TPREFIX_NAME(LIBXSMM_CONCATENATE(ITYPE, OTYPE))
#define LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION) LIBXSMM_TPREFIX(LIBXSMM_CONCATENATE(ITYPE, OTYPE), FUNCTION)
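/* Expansion sketch (editor-added, derived from the definitions above): with
   ITYPE=float and OTYPE=float, LIBXSMM_TPREFIX_floatfloat is 's', hence
   LIBXSMM_MMFUNCTION_TYPE2(float, float)   -> libxsmm_smmfunction
   LIBXSMM_MMDISPATCH_SYMBOL2(float, float) -> libxsmm_smmdispatch
   while LIBXSMM_TPREFIX_shortint is 'wi', so
   LIBXSMM_MMFUNCTION_TYPE2(short, int)     -> libxsmm_wimmfunction. */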
/** Helper macro for comparing selected types. */
#define LIBXSMM_EQUAL(T1, T2) LIBXSMM_CONCATENATE3(LIBXSMM_EQUAL_, T1, T2)
#define LIBXSMM_EQUAL_floatfloat 1
#define LIBXSMM_EQUAL_doubledouble 1
#define LIBXSMM_EQUAL_floatdouble 0
#define LIBXSMM_EQUAL_doublefloat 0
#define LIBXSMM_EQUAL_shortdouble 0
#define LIBXSMM_EQUAL_shortfloat 0
#if defined(LIBXSMM_BLAS_CONST)
# undef LIBXSMM_BLAS_CONST
# define LIBXSMM_BLAS_CONST const
#elif defined(OPENBLAS_CONST)
# define LIBXSMM_BLAS_CONST OPENBLAS_CONST
#elif defined(LIBXSMM_BLAS_NONCONST) || defined(__OPENBLAS) || defined(__OPENBLAS77)
# define LIBXSMM_BLAS_CONST
#else
# define LIBXSMM_BLAS_CONST const
#endif
#if !defined(LIBXSMM_NO_BLAS)
# if (!defined(__BLAS) || (0 != __BLAS))
# define LIBXSMM_NO_BLAS 0
# define LIBXSMM_BLAS 1
# else
# define LIBXSMM_NO_BLAS 1
# define LIBXSMM_BLAS 0
# endif
#endif
#if defined(__BLAS) && (1 == __BLAS)
# if defined(__OPENBLAS)
LIBXSMM_EXTERN void openblas_set_num_threads(int num_threads);
# define LIBXSMM_BLAS_INIT openblas_set_num_threads(1);
# endif
#endif
#if !defined(LIBXSMM_BLAS_INIT)
# define LIBXSMM_BLAS_INIT
#endif
#if defined(LIBXSMM_BUILD)
# if defined(LIBXSMM_BUILD_EXT) && !defined(__STATIC)
# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_APIEXT
# elif defined(LIBXSMM_NO_BLAS) && (1 == LIBXSMM_NO_BLAS)
# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_API
# endif
#endif
#if !defined(LIBXSMM_BLAS_SYMBOL_VISIBILITY)
# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_EXTERN LIBXSMM_VISIBILITY_IMPORT LIBXSMM_RETARGETABLE
#endif
#if defined(NOTHROW)
# define LIBXSMM_BLAS_NOTHROW NOTHROW
#else
# define LIBXSMM_BLAS_NOTHROW LIBXSMM_NOEXCEPT
#endif
#define LIBXSMM_BLAS_NOEXCEPT(KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_NOEXCEPT_, KIND)
#if defined(LIBXSMM_MKL_VERSION3) && (LIBXSMM_VERSION3(2020, 0, 2) <= LIBXSMM_MKL_VERSION3)
# define LIBXSMM_BLAS_NOEXCEPT_gemm_batch LIBXSMM_BLAS_NOTHROW
#else
# define LIBXSMM_BLAS_NOEXCEPT_gemm_batch
#endif
#define LIBXSMM_BLAS_NOEXCEPT_gemm LIBXSMM_BLAS_NOTHROW
#define LIBXSMM_BLAS_NOEXCEPT_gemv LIBXSMM_BLAS_NOTHROW
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm_batch(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \
libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE STAR STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \
libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemv(CONST_STAR, STAR, TYPE) char CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_SYMBOL_SIGNATURE_, KIND)(CONST_STAR, STAR, TYPE)
#define LIBXSMM_BLAS_SYMBOL_FDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \
void LIBXSMM_BLAS_SYMBOL(TYPE, KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND)
#define LIBXSMM_BLAS_SYMBOL_CDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \
void LIBXSMM_CBLAS_SYMBOL(TYPE, KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND)
#if (0 != LIBXSMM_BLAS) /* BLAS available */
# define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND) LIBXSMM_BLAS_DECL(TYPE, KIND, LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, TYPE, KIND))
#else
# define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND)
#endif
/** Helper macro consolidating the transpose requests into a set of flags. */
#define LIBXSMM_GEMM_FLAGS(TRANSA, TRANSB) /* check for N/n rather than T/t since C/c is also valid! */ \
((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \
| (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B))
/** Helper macro consolidating CBLAS transpose requests into a set of flags. */
#define LIBXSMM_GEMM_CFLAGS(TRANSA, TRANSB) /* check for N/n rather than T/t since C/c is also valid! */ \
((CblasNoTrans == (TRANSA) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \
| (CblasNoTrans == (TRANSB) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B))
/** Helper macro consolidating the transpose requests into a set of flags. */
#define LIBXSMM_GEMM_VNNI_FLAGS(TRANSA, TRANSB, VNNIA, VNNIB) /* check for N/n rather than T/t since C/c is also valid! */ \
((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \
| (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B) \
| (('n' == (VNNIA) || *"N" == (VNNIA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_A) \
| (('n' == (VNNIB) || *"N" == (VNNIB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_B))
/** Helper macro allowing NULL-requests (transposes) supplied by some default. */
#define LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, DEFAULT) LIBXSMM_GEMM_FLAGS( \
NULL != ((const void*)(TRANSA)) ? (*(const char*)(TRANSA)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (DEFAULT)) ? 'n' : 't'), \
NULL != ((const void*)(TRANSB)) ? (*(const char*)(TRANSB)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (DEFAULT)) ? 'n' : 't')) \
| (~(LIBXSMM_GEMM_FLAG_TRANS_A | LIBXSMM_GEMM_FLAG_TRANS_B) & (DEFAULT))
/** Inlinable GEMM exercising the compiler's code generation (macro template). TODO: only NN is supported, and only SP/DP matrices. */
#define LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \
/* Using 'n' (instead of 'N') avoids a warning about "no macro replacement within a character constant". */ \
const char libxsmm_inline_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \
(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \
const char libxsmm_inline_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \
(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \
const libxsmm_blasint libxsmm_inline_xgemm_m_ = *(const libxsmm_blasint*)(M); /* must be specified */ \
const libxsmm_blasint libxsmm_inline_xgemm_k_ = (NULL != ((void*)(K)) ? (*(const libxsmm_blasint*)(K)) : libxsmm_inline_xgemm_m_); \
const libxsmm_blasint libxsmm_inline_xgemm_n_ = (NULL != ((void*)(N)) ? (*(const libxsmm_blasint*)(N)) : libxsmm_inline_xgemm_k_); \
const libxsmm_blasint libxsmm_inline_xgemm_lda_ = (NULL != ((void*)(LDA)) ? (*(const libxsmm_blasint*)(LDA)) : \
(('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_) ? libxsmm_inline_xgemm_m_ : libxsmm_inline_xgemm_k_)); \
const libxsmm_blasint libxsmm_inline_xgemm_ldb_ = (NULL != ((void*)(LDB)) ? (*(const libxsmm_blasint*)(LDB)) : \
(('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_) ? libxsmm_inline_xgemm_k_ : libxsmm_inline_xgemm_n_)); \
const libxsmm_blasint libxsmm_inline_xgemm_ldc_ = (NULL != ((void*)(LDC)) ? (*(const libxsmm_blasint*)(LDC)) : libxsmm_inline_xgemm_m_); \
const OTYPE libxsmm_inline_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
const OTYPE libxsmm_inline_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
libxsmm_blasint libxsmm_inline_xgemm_ni_, libxsmm_inline_xgemm_mi_ = 0, libxsmm_inline_xgemm_ki_; /* loop induction variables */ \
LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_); \
LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_); \
LIBXSMM_PRAGMA_SIMD \
for (libxsmm_inline_xgemm_mi_ = 0; libxsmm_inline_xgemm_mi_ < libxsmm_inline_xgemm_m_; ++libxsmm_inline_xgemm_mi_) { \
LIBXSMM_PRAGMA_LOOP_COUNT(1, LIBXSMM_CONFIG_MAX_DIM, LIBXSMM_CONFIG_AVG_DIM) \
for (libxsmm_inline_xgemm_ki_ = 0; libxsmm_inline_xgemm_ki_ < libxsmm_inline_xgemm_k_; ++libxsmm_inline_xgemm_ki_) { \
LIBXSMM_PRAGMA_UNROLL \
for (libxsmm_inline_xgemm_ni_ = 0; libxsmm_inline_xgemm_ni_ < libxsmm_inline_xgemm_n_; ++libxsmm_inline_xgemm_ni_) { \
((OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] \
= ((const ITYPE*)(B))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldb_+libxsmm_inline_xgemm_ki_] * \
(((const ITYPE*)(A))[libxsmm_inline_xgemm_ki_*libxsmm_inline_xgemm_lda_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_alpha_) \
+ ((const OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_beta_; \
} \
} \
} \
}
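/* A usage sketch (editor-added) of the LIBXSMM_INLINE_XGEMM template above
   for a small single-precision NN multiply; per the TODO, only 'n'/'n'
   transposes are supported, and NULL leading dimensions fall back to the
   defaults (M/K/M) computed inside the macro. */
#include <libxsmm.h>
void inline_gemm_example(const float* a, const float* b, float* c) {
  const libxsmm_blasint m = 16, n = 16, k = 16;
  const float alpha = 1.f, beta = 0.f;
  LIBXSMM_INLINE_XGEMM(float, float, "N", "N", &m, &n, &k,
    &alpha, a, NULL/*lda: defaults to m*/, b, NULL/*ldb: defaults to k*/,
    &beta, c, NULL/*ldc: defaults to m*/);
}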
#if (defined(LIBXSMM_INIT) || defined(LIBXSMM_CTOR))
# undef LIBXSMM_INIT
# define LIBXSMM_INIT LIBXSMM_ASSERT_MSG(1 < libxsmm_ninit, "LIBXSMM is not initialized");
# define LIBXSMM_INIT_COMPLETED
#else
# define LIBXSMM_INIT if (2 > libxsmm_ninit) libxsmm_init();
#endif
/** Map to appropriate BLAS function (or fallback). The mapping is used, e.g., inside of LIBXSMM_BLAS_XGEMM. */
#define LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_FUNCTION_, LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION))
#if (0 != LIBXSMM_BLAS) /* Helper macro to possibly (if defined) call libxsmm_init */
# if defined(LIBXSMM_INIT_COMPLETED)
# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch_function
# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch_function
# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm_function
# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm_function
# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv_function
# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv_function
# else
# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch()
# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch()
# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm()
# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm()
# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv()
# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv()
# endif
#else /* no BLAS */
# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_blas_error("dgemm_batch")
# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_blas_error("sgemm_batch")
# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_blas_error("dgemm")
# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_blas_error("sgemm")
# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_blas_error("dgemv")
# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_blas_error("sgemv")
#endif
/** Low-precision (BLAS-like) function symbols. */
#define LIBXSMM_BLAS_FUNCTION_wigemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_INLINE_XGEMM(short, int, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#define LIBXSMM_BLAS_FUNCTION_bsgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_INLINE_XGEMM(libxsmm_bfloat16, float, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
/** Short-cut macros to construct desired BLAS function symbol. */
#define LIBXSMM_BLAS_FUNCTION1(TYPE, FUNCTION) LIBXSMM_BLAS_FUNCTION(TYPE, TYPE, FUNCTION)
#define LIBXSMM_GEMM_BATCH_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm_batch)
#define LIBXSMM_GEMM_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm)
#define LIBXSMM_GEMV_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemv)
/** BLAS-based GEMM supplied by the linked LAPACK/BLAS library (macro template). */
#define LIBXSMM_BLAS_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \
/* Using 'n' (instead of 'N') avoids a warning about "no macro replacement within a character constant". */ \
const char libxsmm_blas_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \
(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \
const char libxsmm_blas_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \
(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \
const libxsmm_blasint *const libxsmm_blas_xgemm_k_ = (NULL != ((void*)(K)) ? (K) : (M)); \
const libxsmm_blasint *const libxsmm_blas_xgemm_n_ = (NULL != ((void*)(N)) ? (N) : libxsmm_blas_xgemm_k_); \
const libxsmm_blasint libxsmm_blas_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \
*(('n' == libxsmm_blas_xgemm_transa_ || *"N" == libxsmm_blas_xgemm_transa_) ? (M) : libxsmm_blas_xgemm_k_), 1); \
const libxsmm_blasint libxsmm_blas_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? *(LDB) : \
*(('n' == libxsmm_blas_xgemm_transb_ || *"N" == libxsmm_blas_xgemm_transb_) ? libxsmm_blas_xgemm_k_ : libxsmm_blas_xgemm_n_), 1); \
const libxsmm_blasint libxsmm_blas_xgemm_ldc_ = LIBXSMM_MAX(NULL != ((void*)(LDC)) ? *(LDC) : *(M), 1); \
const OTYPE libxsmm_blas_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
const OTYPE libxsmm_blas_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(&libxsmm_blas_xgemm_transa_, &libxsmm_blas_xgemm_transb_, \
M, libxsmm_blas_xgemm_n_, libxsmm_blas_xgemm_k_, \
&libxsmm_blas_xgemm_alpha_, (const ITYPE*)(A), &libxsmm_blas_xgemm_lda_, \
(const ITYPE*)(B), &libxsmm_blas_xgemm_ldb_, \
&libxsmm_blas_xgemm_beta_, (ITYPE*)(C), &libxsmm_blas_xgemm_ldc_); \
}
/** Helper macros for calling a dispatched function in a row/column-major aware fashion. */
#define LIBXSMM_MMCALL_ABC(FN, A, B, C) \
LIBXSMM_ASSERT(FN); FN(A, B, C)
#define LIBXSMM_MMCALL_PRF(FN, A, B, C, PA, PB, PC) { \
LIBXSMM_NOPREFETCH_A(LIBXSMM_UNUSED(PA)); \
LIBXSMM_NOPREFETCH_B(LIBXSMM_UNUSED(PB)); \
LIBXSMM_NOPREFETCH_C(LIBXSMM_UNUSED(PC)); \
LIBXSMM_ASSERT(FN); FN(A, B, C, \
LIBXSMM_GEMM_PREFETCH_A(PA), \
LIBXSMM_GEMM_PREFETCH_B(PB), \
LIBXSMM_GEMM_PREFETCH_C(PC)); \
}
#if (0/*LIBXSMM_GEMM_PREFETCH_NONE*/ == LIBXSMM_PREFETCH)
# define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \
LIBXSMM_MMCALL_ABC(FN, A, B, C)
#else
# define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \
LIBXSMM_MMCALL_PRF(FN, A, B, C, (A) + ((size_t)LDA) * (K), (B) + ((size_t)LDB) * (N), (C) + ((size_t)LDC) * (N))
#endif
#define LIBXSMM_MMCALL(FN, A, B, C, M, N, K) LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, M, K, M)
/** Calculate problem size from M, N, and K using the correct integer type in order to cover the general case. */
#define LIBXSMM_MNK_SIZE(M, N, K) (((size_t)(M)) * ((size_t)(N)) * ((size_t)(K)))
/** Calculate the total number of matrix elements; matrices A, B, and C are given by M, N, and K, and S weights the C-size. */
#define LIBXSMM_SIZE(M, N, K, S) \
(((size_t)(M) * (size_t)(K)) + ((size_t)(K) * (size_t)(N)) + \
(((size_t)(S) * (size_t)(M) * (size_t)(N))))
/** Condition based on arithmetic intensity (AI) */
#define LIBXSMM_SMM_AI(M, N, K, S, TYPESIZE) \
((LIBXSMM_MNK_SIZE(M, N, K) * 2) <= ((size_t)(TYPESIZE) * 4/*AI*/ * LIBXSMM_SIZE(M, N, K, S)))
/** Determine whether an SMM is suitable, i.e., small enough. */
#if !defined(LIBXSMM_THRESHOLD_AI) /* traditional MNK-threshold */
# define LIBXSMM_SMM(M, N, K, S, TYPESIZE) (LIBXSMM_MNK_SIZE(M, N, K) <= (LIBXSMM_MAX_MNK))
#else /* threshold based on arithmetic intensity */
# define LIBXSMM_SMM LIBXSMM_SMM_AI
#endif
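/* Worked example (editor-added) for the traditional MNK-threshold path
   above: with M=N=K=32, LIBXSMM_MNK_SIZE(32, 32, 32) = 32768; assuming the
   default LIBXSMM_MAX_MNK of 64*64*64 = 262144 (a configuration value not
   shown in this excerpt), LIBXSMM_SMM holds and the multiplication counts
   as "small", i.e., eligible for a specialized kernel instead of the
   fallback code path. */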
/** Fall-back code paths: LIBXSMM_XGEMM_FALLBACK0, and LIBXSMM_XGEMM_FALLBACK1 (macro template). */
#if !defined(LIBXSMM_XGEMM_FALLBACK0)
# define LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#if !defined(LIBXSMM_XGEMM_FALLBACK1)
# define LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
/**
* Execute a specialized function, or use a fallback code path depending on threshold (macro template).
* LIBXSMM_XGEMM_FALLBACK0 or specialized function: below LIBXSMM_MAX_MNK
* LIBXSMM_XGEMM_FALLBACK1: above LIBXSMM_MAX_MNK
*/
#define LIBXSMM_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \
const int libxsmm_xgemm_flags_ = LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, LIBXSMM_FLAGS) | LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE); \
const libxsmm_blasint *const libxsmm_xgemm_k_ = (NULL != (K) ? (K) : (M)); \
const libxsmm_blasint *const libxsmm_xgemm_n_ = (NULL != (N) ? (N) : libxsmm_xgemm_k_); \
const libxsmm_blasint libxsmm_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \
*(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? (M) : libxsmm_xgemm_k_), 1); \
const libxsmm_blasint libxsmm_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? *(LDB) : \
*(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? libxsmm_xgemm_k_ : libxsmm_xgemm_n_), 1); \
const libxsmm_blasint libxsmm_xgemm_ldc_ = LIBXSMM_MAX(NULL != (LDC) ? *(LDC) : *(M), 1); \
if (LIBXSMM_SMM(*(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, 2/*RFO*/, sizeof(OTYPE))) { \
const LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) libxsmm_mmfunction_ = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE)( \
*(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, &libxsmm_xgemm_lda_, &libxsmm_xgemm_ldb_, &libxsmm_xgemm_ldc_, \
(const OTYPE*)(ALPHA), (const OTYPE*)(BETA), &libxsmm_xgemm_flags_, NULL); \
if (NULL != libxsmm_mmfunction_) { \
LIBXSMM_MMCALL_LDX(libxsmm_mmfunction_, (const ITYPE*)(A), (const ITYPE*)(B), (OTYPE*)(C), \
*(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, libxsmm_xgemm_lda_, libxsmm_xgemm_ldb_, libxsmm_xgemm_ldc_); \
} \
else { \
const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \
const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \
const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \
M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \
&libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \
B, &libxsmm_xgemm_ldb_, \
&libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \
} \
} \
else { \
const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \
const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \
const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \
M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \
&libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \
B, &libxsmm_xgemm_ldb_, \
&libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \
} \
}
/** Helper macro to set up a matrix with some initial values. */
#define LIBXSMM_MATINIT_AUX(OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) { \
/*const*/ double libxsmm_matinit_seed_ = (double)(SEED); /* avoid constant conditional */ \
const double libxsmm_matinit_scale_ = (SCALE) * libxsmm_matinit_seed_ + (SCALE); \
const libxsmm_blasint libxsmm_matinit_nrows_ = (libxsmm_blasint)NROWS; \
const libxsmm_blasint libxsmm_matinit_ld_ = (libxsmm_blasint)LD; \
libxsmm_blasint libxsmm_matinit_i_ = 0, libxsmm_matinit_j_ = 0; \
LIBXSMM_OMP_VAR(libxsmm_matinit_i_); LIBXSMM_OMP_VAR(libxsmm_matinit_j_); \
if (0 != libxsmm_matinit_seed_) { \
OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \
for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \
for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < libxsmm_matinit_nrows_; ++libxsmm_matinit_j_) { \
const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \
(DST)[libxsmm_matinit_k_] = (TYPE)(libxsmm_matinit_scale_ * (1.0 + \
libxsmm_matinit_i_ * libxsmm_matinit_nrows_ + libxsmm_matinit_j_)); \
} \
for (; libxsmm_matinit_j_ < libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \
const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \
(DST)[libxsmm_matinit_k_] = (TYPE)(SEED); \
} \
} \
} \
else { /* shuffle based initialization */ \
const unsigned int libxsmm_matinit_maxval_ = ((unsigned int)NCOLS) * ((unsigned int)libxsmm_matinit_ld_); \
const TYPE libxsmm_matinit_maxval2_ = (TYPE)(libxsmm_matinit_maxval_ / 2), libxsmm_matinit_inv_ = (TYPE)((SCALE) / libxsmm_matinit_maxval2_); \
const size_t libxsmm_matinit_shuffle_ = libxsmm_shuffle(libxsmm_matinit_maxval_); \
OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \
for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \
for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \
const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \
(DST)[libxsmm_matinit_k_] = libxsmm_matinit_inv_ * /* normalize values to an interval of [-1, +1] */ \
((TYPE)(libxsmm_matinit_shuffle_ * libxsmm_matinit_k_ % libxsmm_matinit_maxval_) - libxsmm_matinit_maxval2_); \
} \
} \
} \
}
#define LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \
LIBXSMM_MATINIT_AUX(LIBXSMM_ELIDE, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE)
#define LIBXSMM_MATINIT_SEQ(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \
LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE)
#define LIBXSMM_MATINIT_OMP(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \
LIBXSMM_MATINIT_AUX(LIBXSMM_PRAGMA_OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE)
/** Call libxsmm_gemm_print using LIBXSMM's GEMM-flags. */
#define LIBXSMM_GEMM_PRINT(OSTREAM, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \
LIBXSMM_GEMM_PRINT2(OSTREAM, PRECISION, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC)
#define LIBXSMM_GEMM_PRINT2(OSTREAM, IPREC, OPREC, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \
libxsmm_gemm_dprint2(OSTREAM, (libxsmm_gemm_precision)(IPREC), (libxsmm_gemm_precision)(OPREC), \
/* Using 'n' (instead of 'N') avoids a warning about "no macro replacement within a character constant". */ \
(char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (FLAGS)) ? 'n' : 't'), \
(char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (FLAGS)) ? 'n' : 't'), \
M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC)
/**
* Utility function, which either prints information about the GEMM call
* or dumps (FILE/ostream=0) all input and output data into MHD files.
* The Meta Image Format (MHD) is suitable for visual inspection using,
* e.g., ITK-SNAP or ParaView.
*/
LIBXSMM_API void libxsmm_gemm_print(void* ostream,
libxsmm_gemm_precision precision, const char* transa, const char* transb,
const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
const void* alpha, const void* a, const libxsmm_blasint* lda,
const void* b, const libxsmm_blasint* ldb,
const void* beta, void* c, const libxsmm_blasint* ldc);
LIBXSMM_API void libxsmm_gemm_print2(void* ostream,
libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb,
const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
const void* alpha, const void* a, const libxsmm_blasint* lda,
const void* b, const libxsmm_blasint* ldb,
const void* beta, void* c, const libxsmm_blasint* ldc);
LIBXSMM_API void libxsmm_gemm_dprint(void* ostream,
libxsmm_gemm_precision precision, char transa, char transb,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
double dalpha, const void* a, libxsmm_blasint lda,
const void* b, libxsmm_blasint ldb,
double dbeta, void* c, libxsmm_blasint ldc);
LIBXSMM_API void libxsmm_gemm_dprint2(void* ostream,
libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, char transa, char transb,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
double dalpha, const void* a, libxsmm_blasint lda,
const void* b, libxsmm_blasint ldb,
double dbeta, void* c, libxsmm_blasint ldc);
LIBXSMM_API void libxsmm_gemm_xprint(void* ostream,
libxsmm_xmmfunction kernel, const void* a, const void* b, void* c);
/** GEMM_BATCH: fallback prototype functions served by any compliant LAPACK/BLAS. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_batch_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch));
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_batch_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch));
/** GEMM: fallback prototype functions served by any compliant LAPACK/BLAS. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm));
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm));
/** GEMV: fallback prototype functions served by any compliant LAPACK/BLAS. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemv_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv));
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemv_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv));
/** Helper function to consume arguments when called. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sink_function)(LIBXSMM_VARIADIC);
/** The original BLAS functions. */
LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemm_function libxsmm_original_dgemm_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemm_function libxsmm_original_sgemm_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemv_function libxsmm_original_dgemv_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemv_function libxsmm_original_sgemv_function);
LIBXSMM_API libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void);
LIBXSMM_API libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void);
LIBXSMM_API libxsmm_dgemm_function libxsmm_original_dgemm(void);
LIBXSMM_API libxsmm_sgemm_function libxsmm_original_sgemm(void);
LIBXSMM_API libxsmm_dgemv_function libxsmm_original_dgemv(void);
LIBXSMM_API libxsmm_sgemv_function libxsmm_original_sgemv(void);
LIBXSMM_API libxsmm_sink_function libxsmm_blas_error(const char* symbol);
LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC);
/**
 * General dense matrix multiplication, which re-exposes LAPACK/BLAS
 * but allows relying on LIBXSMM's defaults (libxsmm_config.h)
 * when NULL-arguments are supplied in certain places.
 */
LIBXSMM_API void libxsmm_blas_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
const void* alpha, const void* a, const libxsmm_blasint* lda,
const void* b, const libxsmm_blasint* ldb,
const void* beta, void* c, const libxsmm_blasint* ldc);
#define libxsmm_blas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \
TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#define libxsmm_blas_sgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \
TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#define libxsmm_dgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \
TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#define libxsmm_sgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \
TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
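/* Minimal sketch of the NULL-default behavior described above (m, n, k,
 * lda, ldb, ldc and the arrays a, b, c are assumptions; NULL stands in for
 * transa, transb, alpha, and beta so that LIBXSMM's defaults apply):
 *   libxsmm_blas_dgemm(NULL, NULL, &m, &n, &k,
 *     NULL, a, &lda, b, &ldb, NULL, c, &ldc);
 */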
/** Translates a GEMM prefetch request into the prefetch-enumeration (incl. the frontend's auto-prefetch). */
LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_xprefetch(const int* prefetch);
LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_prefetch(int prefetch);
/** Converts the given value to double-precision according to the given type. */
LIBXSMM_API int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue);
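/* Sketch (the return value is assumed to signal success/failure;
 * LIBXSMM_DATATYPE_F32 is part of libxsmm_typedefs.h):
 *   float f = 0.5f; double d = 0;
 *   const int result = libxsmm_dvalue(LIBXSMM_DATATYPE_F32, &f, &d);
 * On success, d holds 0.5.
 */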
#endif /*LIBXSMM_FRONTEND_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_FSSPMDM_H
#define LIBXSMM_FSSPMDM_H
#include "libxsmm_typedefs.h"
/** Opaque types for fsspmdm */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dfsspmdm libxsmm_dfsspmdm;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_sfsspmdm libxsmm_sfsspmdm;
LIBXSMM_API libxsmm_dfsspmdm* libxsmm_dfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
const double alpha, const double beta, libxsmm_blasint c_is_nt,
const double* a_dense );
LIBXSMM_API void libxsmm_dfsspmdm_execute( const libxsmm_dfsspmdm* handle, const double* B, double* C );
LIBXSMM_API void libxsmm_dfsspmdm_destroy( libxsmm_dfsspmdm* handle );
LIBXSMM_API libxsmm_sfsspmdm* libxsmm_sfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
const float alpha, const float beta, libxsmm_blasint c_is_nt,
const float* a_dense );
LIBXSMM_API void libxsmm_sfsspmdm_execute( const libxsmm_sfsspmdm* handle, const float* B, float* C );
LIBXSMM_API void libxsmm_sfsspmdm_destroy( libxsmm_sfsspmdm* handle );
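/* Illustrative lifecycle sketch for the double-precision handle (assumes
 * C = alpha * A * B + beta * C with A supplied as dense values in a_dense;
 * sizes and arrays are assumptions, error handling is omitted):
 *   libxsmm_dfsspmdm* const handle = libxsmm_dfsspmdm_create(
 *     m, n, k, lda, ldb, ldc, 1.0, 0.0, 0, a_dense);
 *   if (NULL != handle) {
 *     libxsmm_dfsspmdm_execute(handle, b, c);
 *     libxsmm_dfsspmdm_destroy(handle);
 *   }
 */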
#endif /*LIBXSMM_FSSPMDM_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_GENERATOR_H
#define LIBXSMM_GENERATOR_H
#include "libxsmm_typedefs.h"
#define LIBXSMM_GEMM_NO_BYPASS(FLAGS, ALPHA, BETA) ( \
0 == ((FLAGS) & (LIBXSMM_GEMM_FLAG_TRANS_A)) && \
(LIBXSMM_FEQ(1, ALPHA) /*|| LIBXSMM_FEQ(-1, ALPHA)*/) && \
(LIBXSMM_FEQ(1, BETA) || LIBXSMM_FEQ(0, BETA)))
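/* For example, LIBXSMM_GEMM_NO_BYPASS(0, 1.0, 0.0) evaluates to true
 * (no transpose-A flag, alpha equal to 1, beta in {0, 1}); any other
 * alpha or beta makes the expression false. */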
/** Initialize GEMM descriptor as used by low-level routines (type-specific). */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_dgemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
double alpha, double beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_sgemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
float alpha, float beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_wigemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
int alpha, int beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bigemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
int alpha, int beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bbgemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
int alpha, int beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bsgemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
float alpha, float beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bgemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
float alpha, float beta, int flags, int prefetch);
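/* Sketch: obtain a double-precision GEMM descriptor from a caller-provided
 * blob (m, n, k, lda, ldb, ldc are assumptions; the zero flag/prefetch
 * arguments are placeholders for the enumerations in libxsmm_typedefs.h):
 *   libxsmm_descriptor_blob blob;
 *   const libxsmm_gemm_descriptor* const desc = libxsmm_dgemm_descriptor_init(
 *     &blob, m, n, k, lda, ldb, ldc, 1.0, 1.0, 0, 0);
 */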
/** Initialize GEMM descriptor (generic: double-precision alpha/beta). */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit(libxsmm_descriptor_blob* blob,
libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta,
int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit2(libxsmm_descriptor_blob* blob,
libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
double alpha, double beta, int flags, int prefetch);
/** Initialize GEMM descriptor as used by low-level routines (generic). */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta,
int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init2(libxsmm_descriptor_blob* blob,
libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta,
int flags, int prefetch);
/** Similar to libxsmm_gemm_descriptor_init2 with optional type-converted alpha/beta (dalpha/dbeta). */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init3(libxsmm_descriptor_blob* blob,
libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta,
int flags, int prefetch, double* dalpha, double* dbeta);
/** Initialize matrix-eltwise descriptor as used by low-level routines. */
LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_datatype in_type, libxsmm_datatype out_type,
libxsmm_blasint m, libxsmm_blasint n,
libxsmm_blasint ldo, libxsmm_blasint ldi,
unsigned short flags, unsigned char param, unsigned char operation);
LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init2(libxsmm_descriptor_blob* blob,
libxsmm_datatype in_type, libxsmm_datatype in2_type, libxsmm_datatype out_type, libxsmm_datatype out2_type,
libxsmm_blasint m, libxsmm_blasint n,
libxsmm_blasint ldo, libxsmm_blasint ldi, libxsmm_blasint ldi2, libxsmm_blasint ldi3,
unsigned short flags, unsigned char param, unsigned char operation);
/** Initialize matrix-equation descriptor as used by low-level routines. */
LIBXSMM_API libxsmm_meqn_descriptor* libxsmm_meqn_descriptor_init(libxsmm_descriptor_blob* blob,
libxsmm_datatype type, libxsmm_blasint m, libxsmm_blasint n,
libxsmm_blasint ldo, unsigned int eqn_idx);
/** Structure referring to the generated code with some attached information. */
LIBXSMM_EXTERN_C typedef struct libxsmm_generated_code {
void* generated_code; /** pointer to memory which can contain strings or binary code */
unsigned int buffer_size; /** total size of the buffer generated_code */
unsigned int code_size; /** size of bytes used in generated_code */
unsigned int code_type; /**
* 0: generated code contains inline assembly in a C function
* which can be dumped into a *.c/cc/cpp file
* 1: generated code contains assembly which can be
* dumped into an *.s file
* >1: generated code contains a function in binary code which can be
* called once the code is copied into executable memory
*/
unsigned int last_error; /**
* 0: no error occurred
* >0: error code
*/
unsigned int arch; /* target arch for the current code generation task */
unsigned int sf_size; /* offset of RSP to the beginning of the stack frame;
* this value is tracked to keep RBP available for general compute
*/
} libxsmm_generated_code;
/** Function to translate LIBXSMM Generator error codes into error messages. */
LIBXSMM_API
const char* libxsmm_strerror(unsigned int i_error_code);
/* @TODO change int based architecture value */
LIBXSMM_API
void libxsmm_generator_gemm_inlineasm(const char* i_file_out,
const char* i_routine_name,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const char* i_arch );
/* @TODO change int based architecture value */
LIBXSMM_API
void libxsmm_generator_gemm_directasm(const char* i_file_out,
const char* i_routine_name,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const char* i_arch );
LIBXSMM_API
void libxsmm_generator_gemm_kernel(libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc );
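/* Sketch: drive the JIT kernel generator into a caller-provided buffer
 * (the buffer size and the descriptor desc are assumptions; code_type > 1
 * requests binary code as documented above):
 *   unsigned char buffer[131072];
 *   libxsmm_generated_code gcode;
 *   memset(&gcode, 0, sizeof(gcode));
 *   gcode.generated_code = buffer;
 *   gcode.buffer_size = sizeof(buffer);
 *   gcode.code_type = 2;
 *   libxsmm_generator_gemm_kernel(&gcode, desc);
 * If gcode.last_error is non-zero, libxsmm_strerror (declared above) maps it
 * to a message; otherwise the first gcode.code_size bytes of buffer hold the
 * kernel, to be copied into executable memory before calling it.
 */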
/* @TODO change int based architecture value */
LIBXSMM_API
void libxsmm_generator_spgemm(const char* i_file_out,
const char* i_routine_name,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const char* i_arch,
const char* i_file_in,
const int i_is_csr);
/* @TODO change int based architecture value */
LIBXSMM_API
void libxsmm_generator_spgemm_csc_kernel(libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const char* i_arch,
const unsigned int* i_row_idx,
const unsigned int* i_column_idx,
const double* i_values);
/* @TODO change int based architecture value */
LIBXSMM_API
void libxsmm_generator_spgemm_csr_kernel(libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const char* i_arch,
const unsigned int* i_row_idx,
const unsigned int* i_column_idx,
const double* i_values);
/* @TODO change int based architecture value */
LIBXSMM_API
void libxsmm_generator_spgemm_csr_reg_kernel(libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const char* i_arch,
const unsigned int* i_row_idx,
const unsigned int* i_column_idx,
const double* i_values);
LIBXSMM_API
void libxsmm_generator_packed_spgemm_csr_kernel( libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const unsigned int* i_row_idx,
const unsigned int* i_column_idx,
const void* i_values,
const unsigned int i_packed_width );
LIBXSMM_API
void libxsmm_generator_packed_spgemm_csc_kernel( libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const unsigned int* i_row_idx,
const unsigned int* i_column_idx,
const void* i_values,
const unsigned int i_packed_width );
LIBXSMM_API
void libxsmm_generator_packed_gemm_ac_rm( libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const unsigned int i_packed_width );
LIBXSMM_API
void libxsmm_generator_packed_gemm_bc_rm( libxsmm_generated_code* io_generated_code,
const libxsmm_gemm_descriptor* i_xgemm_desc,
const unsigned int i_packed_width );
LIBXSMM_API
void libxsmm_generator_mateltwise_kernel( libxsmm_generated_code* io_generated_code,
const libxsmm_meltw_descriptor* i_mateltw_desc );
LIBXSMM_API
void libxsmm_generator_matequation_kernel( libxsmm_generated_code* io_generated_code,
const libxsmm_meqn_descriptor* i_mateqn_desc );
/** Initialization counter that can be used to check whether the library is initialized (!=0) or not (==0). */
LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_ninit);
/** Target architecture (libxsmm_get_target_archid, libxsmm_set_target_archid). */
LIBXSMM_APIVAR_PUBLIC(int libxsmm_target_archid);
/** Verbosity level (0: quiet, 1: errors, 2: warnings, 3: info, neg.: all/dump). */
LIBXSMM_APIVAR_PUBLIC(int libxsmm_verbosity);
/** Security-enhanced environment. */
LIBXSMM_APIVAR_PUBLIC(int libxsmm_se);
#endif /*LIBXSMM_GENERATOR_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_INTRINSICS_X86_H
#define LIBXSMM_INTRINSICS_X86_H
#include "libxsmm_cpuid.h"
/** Macro evaluates to LIBXSMM_ATTRIBUTE_TARGET_xxx (see below). */
#define LIBXSMM_ATTRIBUTE_TARGET(TARGET) LIBXSMM_CONCATENATE(LIBXSMM_ATTRIBUTE_TARGET_, TARGET)
#if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_PLATFORM_X86)
# define LIBXSMM_INTRINSICS_NONE
#endif
#if /*no intrinsics: tested with 17.x and 18.x*/(defined(__PGI) && \
LIBXSMM_VERSION2(19, 0) > LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) \
|| /*legacy*/(defined(_CRAYC) && !defined(__GNUC__))
# if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC)
# define LIBXSMM_INTRINSICS_NONE
# endif
#elif !defined(LIBXSMM_INTRINSICS_STATIC) && !defined(LIBXSMM_INTRINSICS_NONE) && ( \
(defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && \
LIBXSMM_VERSION2(4, 4) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) /* GCC 4.4 (target-attribute) */ \
|| (defined(__clang__) && LIBXSMM_VERSION2(3, 7) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \
|| (defined(__APPLE__) && defined(__MACH__) && !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && \
LIBXSMM_VERSION2(9, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)))
# define LIBXSMM_INTRINSICS_STATIC
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
/** https://github.com/intel/Immintrin-debug */
#if !defined(LIBXSMM_INTRINSICS_DEBUG) && 0
# define LIBXSMM_INTRINSICS_DEBUG
/* workarounds removed after LIBXSMM 1.16.1-1268 */
# include "immintrin_dbg.h"
#endif
#if defined(__MIC__) && !defined(LIBXSMM_INTRINSICS_NONE)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC
# endif
# define LIBXSMM_INTRINSICS(TARGET)
# define LIBXSMM_INTRINSICS_INCLUDE
#elif !defined(LIBXSMM_INTRINSICS_NONE) /*!defined(__MIC__)*/
# if defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) /* TODO: check GCC, Clang, etc. */ \
|| (LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(99, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512PF__) && defined(__AVX512ER__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE42
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE3
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(LIBXSMM_PLATFORM_X86)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_GENERIC
# endif
# if defined(__GNUC__)
# define LIBXSMM_INTRINSICS_INCLUDE
# endif
# endif
# if defined(LIBXSMM_STATIC_TARGET_ARCH) && !defined(LIBXSMM_INTRINSICS_STATIC)
# if defined(__INTEL_COMPILER)
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
/* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */
# if 1904 <= (LIBXSMM_INTEL_COMPILER) && !defined(_WIN32)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif 1801 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# elif 1500 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# elif 1400 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC
# else
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# endif
# define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(_CRAYC) && defined(__GNUC__)
/* TODO: version check, e.g., LIBXSMM_VERSION2(11, 5) <= LIBXSMM_VERSION2(_RELEASE, _RELEASE_MINOR) */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX
# endif
# define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(_MSC_VER) && !defined(__clang__)
/* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/
# define LIBXSMM_INTRINSICS_INCLUDE
# elif (!defined(__GNUC__) || LIBXSMM_VERSION2(4, 9) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
&& (!defined(__clang__) || LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \
&& (!defined(__APPLE__) || !defined(__MACH__)) && !defined(__PGI) && !defined(_MSC_VER)
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG) /* Cygwin: invalid register for .seh_savexmm */
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# elif (defined(__clang__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif (defined(__GNUC__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif (defined(__GNUC__) && LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# elif (defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# else
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# else /* GCC/legacy incl. Clang */
# if defined(__clang__) && !(defined(__APPLE__) && defined(__MACH__)) && !defined(_WIN32)
# if (LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) /* TODO */
/* no limitations */
# elif (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < LIBXSMM_X86_AVX2/*workaround*/)
# define LIBXSMM_INTRINSICS_STATIC
# endif
# elif !defined(LIBXSMM_INTRINSICS_STATIC)
# define LIBXSMM_INTRINSICS_STATIC
# endif
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG) /* Cygwin: invalid register for .seh_savexmm */
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# elif LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif LIBXSMM_VERSION2( 6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# else
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# endif
# endif
# else /* fallback */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH
# endif
# if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < LIBXSMM_X86_AVX2/*workaround*/)
# define LIBXSMM_INTRINSICS_STATIC
# endif
# endif
# if !defined(LIBXSMM_INTRINSICS_INCLUDE) && (!defined(__PGI) || LIBXSMM_VERSION2(19, 0) <= LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__))
# define LIBXSMM_INTRINSICS_INCLUDE
# endif
# endif /* GCC/legacy incl. Clang */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# error "LIBXSMM_MAX_STATIC_TARGET_ARCH not defined!"
# endif
# if defined(LIBXSMM_TARGET_ARCH) && (LIBXSMM_TARGET_ARCH < LIBXSMM_MAX_STATIC_TARGET_ARCH)
# undef LIBXSMM_MAX_STATIC_TARGET_ARCH
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH
# endif
# if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_DEBUG)
# include <immintrin.h>
# endif /*defined(LIBXSMM_INTRINSICS_INCLUDE)*/
# if !defined(LIBXSMM_INTRINSICS)
# if (LIBXSMM_MAX_STATIC_TARGET_ARCH > LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_INTRINSICS(TARGET) LIBXSMM_ATTRIBUTE(LIBXSMM_ATTRIBUTE_TARGET(TARGET))
/* LIBXSMM_ATTRIBUTE_TARGET_xxx is required to literally match the CPUID (libxsmm_cpuid.h)! */
# define LIBXSMM_ATTRIBUTE_TARGET_1002 target("sse2") /* LIBXSMM_X86_GENERIC (64-bit ABI) */
# if (LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1003 target("sse3")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1003 LIBXSMM_ATTRIBUTE_TARGET_1002
# endif
# if (LIBXSMM_X86_SSE42 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1004 target("sse4.1,sse4.2")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1004 LIBXSMM_ATTRIBUTE_TARGET_1003
# endif
# if (LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1005 target("avx")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1005 LIBXSMM_ATTRIBUTE_TARGET_1004
# endif
# if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1006 target("avx2,fma")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1006 LIBXSMM_ATTRIBUTE_TARGET_1005
# endif
# if (LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1007 target("avx2,fma,avx512f,avx512cd")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1007 LIBXSMM_ATTRIBUTE_TARGET_1006
# endif
# if (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1010 target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er")
# else /* LIBXSMM_X86_AVX512 */
# define LIBXSMM_ATTRIBUTE_TARGET_1010 LIBXSMM_ATTRIBUTE_TARGET_1007
# endif
# if (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1011 target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er,avx5124vnniw,avx5124fmaps")
# else /* LIBXSMM_X86_AVX512_MIC */
# define LIBXSMM_ATTRIBUTE_TARGET_1011 LIBXSMM_ATTRIBUTE_TARGET_1010
# endif
# if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1020 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl")
# else /* LIBXSMM_X86_AVX512 */
# define LIBXSMM_ATTRIBUTE_TARGET_1020 LIBXSMM_ATTRIBUTE_TARGET_1007
# endif
# if (LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1021 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni")
# else /* LIBXSMM_X86_AVX512_CORE */
# define LIBXSMM_ATTRIBUTE_TARGET_1021 LIBXSMM_ATTRIBUTE_TARGET_1020
# endif
# if (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1022 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni,avx512bf16")
# else /* LIBXSMM_X86_AVX512_CORE */
# define LIBXSMM_ATTRIBUTE_TARGET_1022 LIBXSMM_ATTRIBUTE_TARGET_1021
# endif
# else
# define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/
# endif
# elif !defined(LIBXSMM_INTRINSICS_TARGET)
# define LIBXSMM_INTRINSICS_TARGET
# endif /*!defined(LIBXSMM_INTRINSICS)*/
# endif /*defined(LIBXSMM_STATIC_TARGET_ARCH)*/
#endif /*!defined(LIBXSMM_INTRINSICS_NONE)*/
#if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC)
# define LIBXSMM_INTRINSICS_NONE
# endif
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC
#endif
#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH
#elif (LIBXSMM_MAX_STATIC_TARGET_ARCH < LIBXSMM_STATIC_TARGET_ARCH)
# undef LIBXSMM_MAX_STATIC_TARGET_ARCH
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH
#endif
#if !defined(LIBXSMM_INTRINSICS)
# define LIBXSMM_INTRINSICS(TARGET)
#endif
/** Include basic x86 intrinsics such as __rdtsc. */
#if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_DEBUG)
# if defined(_WIN32)
# include <intrin.h>
# elif defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) || defined(__clang__) || defined(__PGI)
# include <x86intrin.h>
# elif defined(__GNUC__) && (LIBXSMM_VERSION2(4, 4) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))
# include <x86intrin.h>
# endif
# include <xmmintrin.h>
# if defined(__SSE3__)
# include <pmmintrin.h>
# endif
#endif
#if !defined(LIBXSMM_INTRINSICS_NONE)
# if defined(_WIN32)
# include <malloc.h>
# else
# include <mm_malloc.h>
# endif
#endif
/**
* Intrinsic-specific fix-ups
*/
# define LIBXSMM_INTRINSICS_LOADU_SI128(A) _mm_loadu_si128(A)
#if !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && ( \
(LIBXSMM_VERSION2(3, 9) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \
|| (LIBXSMM_VERSION2(7, 3) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && \
defined(__APPLE__) && defined(__MACH__)))
/* prototypes with incorrect signature: _mm512_load_ps takes DP*, _mm512_load_pd takes SP* (checked with v3.8.1) */
# define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const double*)(A))
# define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const float*)(A))
/* Clang misses _mm512_stream_p? (checked with v3.8.1). */
# define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_store_si512(A, B)
# define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_storeu_ps(A, B)
# define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_store_pd(A, B)
#else
# define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const float*)(A))
# define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const double*)(A))
# define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_stream_si512((__m512i*)(A), (B))
# define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_stream_ps(A, B)
# define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_stream_pd(A, B)
#endif
#if !defined(LIBXSMM_INTEL_COMPILER) || (defined(__clang__) && ( \
(LIBXSMM_VERSION2(8, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)))) \
|| (defined(__APPLE__) && defined(__MACH__)) || defined(__GNUC__)
# define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_si256((__m256i*)(A), B)
#else
# define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_epi32(A, B)
#endif
#if defined(LIBXSMM_INTEL_COMPILER)
# if 1600 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \
_mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0)
# else
# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \
_mm512_castps_si512(_mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0))
# endif
#else
# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \
_mm512_set_epi32(((E31) << 16) | (E30), ((E29) << 16) | (E28), ((E27) << 16) | (E26), ((E25) << 16) | (E24), \
((E23) << 16) | (E22), ((E21) << 16) | (E20), ((E19) << 16) | (E18), ((E17) << 16) | (E16), \
((E15) << 16) | (E14), ((E13) << 16) | (E12), ((E11) << 16) | (E10), ((E9) << 16) | (E8), \
((E7) << 16) | (E6), ((E5) << 16) | (E4), ((E3) << 16) | (E2), ((E1) << 16) | (E0))
#endif
#if defined(LIBXSMM_INTEL_COMPILER) \
|| (defined(__GNUC__) && LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && (!defined(__APPLE__) || !defined(__MACH__)) \
&& LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_mask_i32gather_epi32(A, B, C, D, E)
# define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm512_extracti64x4_epi64(A, B)
# define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_abs_ps(A)
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_undefined_epi32()
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_undefined()
# define LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256() _mm256_undefined_si256()
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_SI128() _mm_undefined_si128()
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_undefined_pd()
#else
# define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_castps_si512(_mm512_mask_i32gather_ps( \
_mm512_castsi512_ps(A), B, C, (const float*)(D), E))
# define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(A), B))
# define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_castsi512_ps(_mm512_and_epi32( \
_mm512_castps_si512(A), _mm512_set1_epi32(0x7FFFFFFF)))
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_set1_epi32(0)
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_set1_ps(0)
# define LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256() _mm256_set1_epi32(0)
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_SI128() _mm_set1_epi32(0)
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_set1_pd(0)
#endif
#if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= (LIBXSMM_INTEL_COMPILER))) \
|| (!defined(LIBXSMM_INTEL_COMPILER) && defined(__GNUC__) \
&& LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| ((!defined(__APPLE__) || !defined(__MACH__)) && defined(__clang__) \
&& LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \
LIBXSMM_CONCATENATE(_store_mask, NBITS)((LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR), SRC)
# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \
LIBXSMM_CONCATENATE(_load_mask, NBITS)((/*const*/ LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(_cvtu32_mask, NBITS)((unsigned int)(A))
#elif defined(LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \
(*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC))
# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \
((LIBXSMM_CONCATENATE(__mmask, NBITS))_mm512_mask2int(*(const __mmask16*)(SRC_PTR)))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_, NBITS)(A)
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_32(A) ((__mmask32)(A))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_16(A) _mm512_int2mask((int)(A))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_8(A) ((__mmask8)(A))
#else
# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \
(*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC))
# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) (*(const LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) ((LIBXSMM_CONCATENATE(__mmask, NBITS))(A))
#endif
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK64(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 64)
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK32(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 32)
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK16(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 16)
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK8(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 8)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK64(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 64)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK32(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 32)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK16(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 16)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK8(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 8)
#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK32(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 32)
#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 16)
#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK8(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 8)
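/* Usage sketch (k1 is an assumed __mmask16 predicate): the mask is stored
 * through a 16-bit location and reloaded later, independent of compiler
 * support for the native _store_mask/_load_mask intrinsics:
 *   unsigned short storage;
 *   LIBXSMM_INTRINSICS_MM512_STORE_MASK16(&storage, k1);
 *   k1 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16(&storage);
 */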
/**
* Pseudo intrinsics for portability
*/
LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD32_SW(unsigned int n) {
unsigned int i, r = 0; if (0 != n) for (i = 1; 0 == (n & i); i <<= 1) { ++r; } return r;
}
LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD64_SW(unsigned long long n) {
unsigned long long i; unsigned int r = 0; /* 64-bit scan variable avoids an endless loop for n >= 2^32 */
if (0 != n) for (i = 1; 0 == (n & i); i <<= 1) { ++r; } return r;
}
/** Binary Logarithm (based on Stackoverflow's NBITSx macro). */
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N) (0 != ((N) & 0x2/*0b10*/) ? 1 : 0)
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N) (0 != ((N) & 0xC/*0b1100*/) ? (2 | LIBXSMM_INTRINSICS_BITSCANBWD_SW02((N) >> 2)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N) (0 != ((N) & 0xF0/*0b11110000*/) ? (4 | LIBXSMM_INTRINSICS_BITSCANBWD_SW04((N) >> 4)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N) (0 != ((N) & 0xFF00) ? (8 | LIBXSMM_INTRINSICS_BITSCANBWD_SW08((N) >> 8)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N) (0 != ((N) & 0xFFFF0000) ? (16 | LIBXSMM_INTRINSICS_BITSCANBWD_SW16((N) >> 16)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW64(N) (0 != ((N) & 0xFFFFFFFF00000000) ? (32 | LIBXSMM_INTRINSICS_BITSCANBWD_SW32((N) >> 32)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD32_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW32((unsigned int)(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD64_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW64((unsigned long long)(N))
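/* Worked example: for N == 40 (binary 101000), LIBXSMM_INTRINSICS_BITSCANFWD32_SW
 * yields 3 (lowest set bit) and LIBXSMM_INTRINSICS_BITSCANBWD32_SW yields 5
 * (highest set bit); both yield 0 for N == 0. */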
#if defined(_WIN32) && !defined(LIBXSMM_INTRINSICS_NONE)
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD32(unsigned int n) {
unsigned long r = 0; _BitScanForward(&r, n); return (0 != n) * r;
}
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD32(unsigned int n) {
unsigned long r = 0; _BitScanReverse(&r, n); return r;
}
# if defined(_WIN64)
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD64(unsigned long long n) {
unsigned long r = 0; _BitScanForward64(&r, n); return (0 != n) * r;
}
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD64(unsigned long long n) {
unsigned long r = 0; _BitScanReverse64(&r, n); return r;
}
# else
# define LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW
# define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW
# endif
#elif defined(__GNUC__) && !defined(LIBXSMM_INTRINSICS_NONE)
# define LIBXSMM_INTRINSICS_BITSCANFWD32(N) ((0 != (N)) * __builtin_ctz(N))
# define LIBXSMM_INTRINSICS_BITSCANFWD64(N) ((0 != (N)) * __builtin_ctzll(N))
# define LIBXSMM_INTRINSICS_BITSCANBWD32(N) ((0 != (N)) * (31 - __builtin_clz(N)))
# define LIBXSMM_INTRINSICS_BITSCANBWD64(N) ((0 != (N)) * (63 - __builtin_clzll(N)))
#else /* fallback implementation */
# define LIBXSMM_INTRINSICS_BITSCANFWD32 LIBXSMM_INTRINSICS_BITSCANFWD32_SW
# define LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW
# define LIBXSMM_INTRINSICS_BITSCANBWD32 LIBXSMM_INTRINSICS_BITSCANBWD32_SW
# define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW
#endif
/** LIBXSMM_NBITS determines the minimum number of bits needed to represent N. */
#define LIBXSMM_NBITS(N) (LIBXSMM_INTRINSICS_BITSCANBWD64(N) + LIBXSMM_MIN(1, N))
#define LIBXSMM_ISQRT2(N) ((unsigned int)((1ULL << (LIBXSMM_NBITS(N) >> 1)) /*+ LIBXSMM_MIN(1, N)*/))
/** LIBXSMM_ILOG2 definition matches ceil(log2(N)). */
LIBXSMM_API_INLINE unsigned int LIBXSMM_ILOG2(unsigned long long n) {
unsigned int result = 0; if (1 < n) {
const unsigned int m = LIBXSMM_INTRINSICS_BITSCANBWD64(n);
result = m + ((unsigned int)LIBXSMM_INTRINSICS_BITSCANBWD64(n - 1) == m);
} return result;
}
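/* Worked examples: LIBXSMM_NBITS(5) == 3, LIBXSMM_ISQRT2(16) == 4, and
 * LIBXSMM_ILOG2(5) == 3 == ceil(log2(5)), whereas LIBXSMM_ILOG2(4) == 2. */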
/**
* Target attribution
*/
#if !defined(LIBXSMM_INTRINSICS_KNC) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(__MIC__)
# define LIBXSMM_INTRINSICS_KNC
#endif
/** LIBXSMM_INTRINSICS_X86 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_X86) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_GENERIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_X86
#endif
/** LIBXSMM_INTRINSICS_SSE3 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_SSE3) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_SSE3
#endif
/** LIBXSMM_INTRINSICS_SSE42 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_SSE42) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE42 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_SSE42
#endif
/** LIBXSMM_INTRINSICS_AVX is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX
#endif
/** LIBXSMM_INTRINSICS_AVX2 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX2
#endif
/** LIBXSMM_INTRINSICS_AVX512 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512
#endif
/** LIBXSMM_INTRINSICS_AVX512_MIC is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_MIC) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_MIC
#endif
/** LIBXSMM_INTRINSICS_AVX512_KNM is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_KNM) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_KNM
#endif
/** LIBXSMM_INTRINSICS_AVX512_CORE is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_CORE) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_CORE
#endif
/** LIBXSMM_INTRINSICS_AVX512_CLX is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_CLX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CLX <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_CLX
#endif
/** LIBXSMM_INTRINSICS_AVX512_CPX is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_CPX) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(LIBXSMM_X86_AVX512_CPX) && \
!defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_INTRINSICS_AVX512_CPX
#endif
/** 2048-bit state for xoshiro128+ RNG (state/symbols needed even if AVX-512 is not used) */
#define LIBXSMM_INTRINSICS_MM512_RNG_STATE(INDEX) (*(__m512i*)LIBXSMM_CONCATENATE(libxsmm_intrinsics_mm512_rng_state, INDEX))
LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state0[16]);
LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state1[16]);
LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state2[16]);
LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state3[16]);
/**
* Pseudo intrinsics (AVX-2)
*/
#if defined(LIBXSMM_INTRINSICS_AVX2) /*__AVX2__*/
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_OFF /* avoid ICE in case of symbols (-g) */
# endif
/** Generates random numbers in the interval [0, 1); thread-safe, provided the state is managed by the user.
 * This is based on xoshiro128+ 1.0, e.g., http://prng.di.unimi.it/xoshiro128plus.c */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) __m256i LIBXSMM_INTRINSICS_MM256_RNG_XOSHIRO128P_EXTSTATE_EPI32(unsigned int* stateptr) {
__m256i state_0 = _mm256_loadu_si256( (const __m256i*)stateptr );
__m256i state_1 = _mm256_loadu_si256( (const __m256i*)(stateptr+16) );
__m256i state_2 = _mm256_loadu_si256( (const __m256i*)(stateptr+32) );
__m256i state_3 = _mm256_loadu_si256( (const __m256i*)(stateptr+48) );
const __m256i result = _mm256_add_epi32(state_0, state_3);
const __m256i s = _mm256_slli_epi32(state_1, 9);
__m256i t;
state_2 = _mm256_xor_si256(state_2, state_0);
state_3 = _mm256_xor_si256(state_3, state_1);
state_1 = _mm256_xor_si256(state_1, state_2);
state_0 = _mm256_xor_si256(state_0, state_3);
state_2 = _mm256_xor_si256(state_2, s);
_mm256_storeu_si256( (__m256i*)stateptr , state_0 );
_mm256_storeu_si256( (__m256i*)(stateptr+16), state_1 );
_mm256_storeu_si256( (__m256i*)(stateptr+32), state_2 );
t = _mm256_slli_epi32(state_3, 11);
state_3 = _mm256_or_si256(t, _mm256_srli_epi32(state_3, 32 - 11));
_mm256_storeu_si256( (__m256i*)(stateptr+48), state_3 );
return result;
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) __m256 LIBXSMM_INTRINSICS_MM256_RNG_EXTSTATE_PS(unsigned int* stateptr) {
const __m256i rng_mantissa = _mm256_srli_epi32( LIBXSMM_INTRINSICS_MM256_RNG_XOSHIRO128P_EXTSTATE_EPI32(stateptr), 9 );
const __m256 one = _mm256_set1_ps(1.0f);
return _mm256_sub_ps(_mm256_castsi256_ps(_mm256_or_si256(_mm256_set1_epi32(0x3f800000), rng_mantissa)), one);
}
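/* Usage sketch: the caller owns a 64-entry state array (matching the four
 * 256-bit loads above), which must be seeded with non-zero values before
 * the first call, e.g., by a splitmix-style expansion of a scalar seed
 * (the seeding scheme is an assumption, not prescribed by this header):
 *   unsigned int state[64];
 *   (seed state with non-zero values here)
 *   const __m256 r = LIBXSMM_INTRINSICS_MM256_RNG_EXTSTATE_PS(state);
 * Each call yields eight uniformly distributed floats in [0, 1).
 */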
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_ON
# endif
#endif /*__AVX2__*/
/**
* Pseudo intrinsics (AVX-512)
*/
#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
# define LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( A, B ) _mm512_cvtepi32_epi16(_mm512_cvt_roundps_epi32( \
_mm512_mul_ps(LIBXSMM_INTRINSICS_MM512_LOAD_PS(A), B), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(__m512 a) {
const __m512i vnaninf = _mm512_set1_epi32(0x7f800000), vrneadd = _mm512_set1_epi32(0x00007fff);
const __m512i vfixup = _mm512_set1_epi32(0x00000001), vfixupmask = _mm512_set1_epi32(0x00010000);
const __m512i mm512_roundbf16rne_a_ = _mm512_castps_si512(a);
const __mmask16 mm512_roundbf16rne_mask1_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vnaninf), vnaninf, _MM_CMPINT_NE);
const __mmask16 mm512_roundbf16rne_mask2_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vfixupmask), vfixupmask, _MM_CMPINT_EQ);
return _mm512_mask_add_epi32(mm512_roundbf16rne_a_, mm512_roundbf16rne_mask1_, mm512_roundbf16rne_a_, _mm512_mask_add_epi32(vrneadd, mm512_roundbf16rne_mask2_, vrneadd, vfixup));
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m256i LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16(__m512 a) {
return _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a), 16));
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_CVT2_FP32_BF16(__m512 a, __m512 b) {
const __m256i aa = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(b), 16));
const __m256i bb = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a), 16));
return _mm512_inserti64x4(_mm512_inserti64x4(_mm512_setzero_si512(), aa, 0), bb, 1);
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(__m256i a) {
return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(a),16));
}
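/* Sketch: round sixteen FP32 values to BF16 storage and widen them back;
 * the reloaded values equal the inputs rounded to BF16 precision (x is an
 * assumed __m512):
 *   const __m256i bh = LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16(x);
 *   const __m512 xr = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(bh);
 */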
/** SVML-style pseudo intrinsics (vectorized tanh approximations). */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(__m512 x) {
const __m512 c0 = _mm512_set1_ps(2027025.0f);
const __m512 c1 = _mm512_set1_ps(270270.0f);
const __m512 c2 = _mm512_set1_ps(6930.0f);
const __m512 c3 = _mm512_set1_ps(36.0f);
const __m512 c1_d = _mm512_set1_ps(945945.0f);
const __m512 c2_d = _mm512_set1_ps(51975.0f);
const __m512 c3_d = _mm512_set1_ps(630.0f);
const __m512 hi_bound = _mm512_set1_ps(4.97f);
const __m512 lo_bound = _mm512_set1_ps(-4.97f);
const __m512 ones = _mm512_set1_ps(1.0f);
const __m512 neg_ones = _mm512_set1_ps(-1.0f);
const __m512 x2 = _mm512_mul_ps( x, x );
const __m512 t1_nom = _mm512_fmadd_ps( c3, x2, c2 );
const __m512 t2_nom = _mm512_fmadd_ps( t1_nom, x2, c1 );
const __m512 t3_nom = _mm512_fmadd_ps( t2_nom, x2, c0 );
const __m512 nom = _mm512_mul_ps( t3_nom, x );
const __m512 t1_denom = _mm512_add_ps( x2, c3_d );
const __m512 t2_denom = _mm512_fmadd_ps( t1_denom, x2, c2_d );
const __m512 t3_denom = _mm512_fmadd_ps( t2_denom, x2, c1_d );
const __m512 denom = _mm512_fmadd_ps( t3_denom, x2, c0 );
const __m512 denom_rcp = _mm512_rcp14_ps( denom );
const __mmask16 mask_hi = _mm512_cmp_ps_mask( x, hi_bound, _CMP_GT_OQ);
const __mmask16 mask_lo = _mm512_cmp_ps_mask( x, lo_bound, _CMP_LT_OQ);
__m512 result = _mm512_mul_ps( nom, denom_rcp );
result = _mm512_mask_blend_ps(mask_hi, result, ones);
result = _mm512_mask_blend_ps(mask_lo, result, neg_ones);
return result;
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_32(__m512 x) {
const __m512 c1 = _mm512_set1_ps((float)(1.0/27.0));
const __m512 c2 = _mm512_set1_ps((float)(1.0/3));
const __m512 hi_bound = _mm512_set1_ps(3.2f);
const __m512 lo_bound = _mm512_set1_ps(-3.2f);
const __m512 ones = _mm512_set1_ps(1.0f);
const __m512 neg_ones = _mm512_set1_ps(-1.0f);
const __m512 x2 = _mm512_mul_ps( x, x );
const __m512 t1_nom = _mm512_fmadd_ps( x2, c1, ones);
const __m512 nom = _mm512_mul_ps( t1_nom, x );
const __m512 denom = _mm512_fmadd_ps( x2, c2, ones);
const __m512 denom_rcp = _mm512_rcp14_ps( denom );
const __mmask16 mask_hi = _mm512_cmp_ps_mask( x, hi_bound, _CMP_GT_OQ);
const __mmask16 mask_lo = _mm512_cmp_ps_mask( x, lo_bound, _CMP_LT_OQ);
__m512 result = _mm512_mul_ps(nom, denom_rcp);
result = _mm512_mask_blend_ps(mask_hi, result, ones);
result = _mm512_mask_blend_ps(mask_lo, result, neg_ones);
return result;
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP2(__m512 _x) {
const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695*2));
const __m512 half = _mm512_set1_ps(0.5f);
const __m512 c2 = _mm512_set1_ps(0.240226507f);
const __m512 c1 = _mm512_set1_ps(0.452920674f);
const __m512 c0 = _mm512_set1_ps(0.713483036f);
const __m512 ones = _mm512_set1_ps(1.0f);
const __m512 minus_twos = _mm512_set1_ps(-2.0f);
const __m512 x = _mm512_fmadd_ps(_x, twice_log2_e, half);
#if 1
const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
const __m512 y = _mm512_reduce_ps(x, 1);
#endif
const __m512 t1 = _mm512_fmadd_ps( y, c2, c1);
const __m512 two_to_y = _mm512_fmadd_ps( y, t1, c0);
const __m512 exp = _mm512_scalef_ps( two_to_y, x );
const __m512 denom_rcp = _mm512_rcp14_ps( _mm512_add_ps( exp, ones) );
__m512 result = _mm512_fmadd_ps( denom_rcp, minus_twos, ones);
return result;
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP3(__m512 _x) {
const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695*2));
const __m512 half = _mm512_set1_ps(0.5f);
const __m512 c3 = _mm512_set1_ps(0.05550410866f);
const __m512 c2 = _mm512_set1_ps(0.15697034396f);
const __m512 c1 = _mm512_set1_ps(0.49454875509f);
const __m512 c0 = _mm512_set1_ps(0.70654502287f);
const __m512 ones = _mm512_set1_ps(1.0f);
const __m512 minus_twos = _mm512_set1_ps(-2.0f);
const __m512 x = _mm512_fmadd_ps(_x, twice_log2_e, half);
#if 1
const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
const __m512 y = _mm512_reduce_ps(x, 1);
#endif
const __m512 t1 = _mm512_fmadd_ps( y, c3, c2);
const __m512 t2 = _mm512_fmadd_ps( y, t1, c1);
const __m512 two_to_y = _mm512_fmadd_ps( y, t2, c0);
const __m512 exp = _mm512_scalef_ps( two_to_y, x );
const __m512 denom_rcp = _mm512_rcp14_ps( _mm512_add_ps( exp, ones) );
__m512 result = _mm512_fmadd_ps( denom_rcp, minus_twos, ones);
return result;
}
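/** Piecewise-quadratic minimax approximation of tanh(x): the exponent and top mantissa bit of |x|
 *  (abs_arg >> 22, clamped to [246, 261]) select one of 16 half-octave intervals, the per-interval
 *  polynomial p0 + |x|*(p1 + |x|*p2) is evaluated via permute-based LUTs, and the sign of x is
 *  restored by XOR. */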
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(__m512 x) {
__m512 result, func_p0, func_p1, func_p2;
const __m512i sign_mask = _mm512_set1_epi32( 0x80000000 );
const __m512i sign_filter = _mm512_set1_epi32( 0x7FFFFFFF );
const __m512i lut_low = _mm512_set1_epi32( 246 );
const __m512i lut_high = _mm512_set1_epi32( 261 );
const __m512 tanh_p0_2_reg = _mm512_set_ps( 0.40555000f, 0.11892800f, -0.00972979f, -0.02740300f, -0.0169851f, -0.00776152f, -0.00305889f,
-0.00116259f, -0.00041726f, -8.53233e-6f, 1.0000000f, 0.99999800f, 0.99975400f, 0.99268200f,
0.93645300f, 0.73833900f);
const __m512 tanh_p1_2_reg = _mm512_set_ps( 0.495602f, 0.88152f, 1.125700000f, 1.17021000f, 1.1289000000f, 1.07929000f, 1.0432300f, 1.023010f,
1.011620f, 1.00164f, 1.56828e-14f, 4.49924e-7f, 0.0000646924f, 0.00260405f, 0.0311608f, 0.168736f);
const __m512 tanh_p2_2_reg = _mm512_set_ps(-0.108238f, -0.2384280f, -0.354418000f, -0.38240300f, -0.34135700f, -0.274509000f, -0.20524900f, -0.1511960f,
-0.107635f, -0.0466868f, -3.60822e-16f, -2.05971e-8f, -4.24538e-6f, -0.000231709f, -0.00386434f, -0.0277702f);
const __m512i signs = _mm512_and_epi32(_mm512_castps_si512(x), sign_mask);
const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter);
__m512i indices = _mm512_srli_epi32(abs_arg, 22);
indices = _mm512_max_epi32(indices, lut_low);
indices = _mm512_min_epi32(indices, lut_high);
func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_2_reg);
func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_2_reg);
func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_2_reg);
result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p2, func_p1);
result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0);
result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs));
return result;
}
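/** Piecewise-cubic variant of LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2; evaluates
 *  p0 + |x|*(p1 + |x|*(p2 + |x|*p3)) per interval. */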
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(__m512 x) {
__m512 result, func_p0, func_p1, func_p2, func_p3;
const __m512i sign_mask = _mm512_set1_epi32( 0x80000000 );
const __m512i sign_filter = _mm512_set1_epi32( 0x7FFFFFFF );
const __m512i lut_low = _mm512_set1_epi32( 246 );
const __m512i lut_high = _mm512_set1_epi32( 261 );
const __m512 tanh_p0_3_reg = _mm512_setr_ps( 0.466283000f, 0.82850600f, 0.97437500f, 0.99882600f, 0.9999860f, 1.0000000f, -1.50006e-08f, -7.98169e-06f,
-4.53753e-05f, -0.00023755f, -0.00125285f, -0.00572314f, -0.0227717f, -0.0629089f, -0.084234300f, 0.071199800f);
const __m512 tanh_p1_3_reg = _mm512_setr_ps( 0.500617f, 0.124369f, 0.0137214f, 0.000464124f, 4.02465e-06f, 0.00000f, 1.00001f, 1.00028f, 1.00112f, 1.00414f,
1.015570f, 1.050950f, 1.1478500f, 1.310130000f, 1.378950000f, 1.07407f);
const __m512 tanh_p2_3_reg = _mm512_setr_ps(-0.16133200f, -0.0305526f, -0.00245909f, -6.12647e-05f, -3.76127e-07f, 0.000000f, -0.000245872f, -0.00341151f,
-0.00971505f, -0.0256817f, -0.06869110f, -0.162433000f, -0.346828000f, -0.566516f, -0.640214000f, -0.44011900f);
const __m512 tanh_p3_3_reg = _mm512_setr_ps( 0.0177393f, 0.00253432f, 0.000147303f, 2.69963e-06f, 1.16764e-08f, 0.0000000f, -0.330125f, -0.3176210f,
-0.3017760f, -0.27358000f, -0.219375000f, -0.136197000f, -0.01868680f, 0.0808901f, 0.107095f, 0.0631459f);
const __m512i signs = _mm512_and_epi32(_mm512_castps_si512(x), sign_mask);
const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter);
__m512i indices = _mm512_srli_epi32(abs_arg, 22);
indices = _mm512_max_epi32(indices, lut_low);
indices = _mm512_min_epi32(indices, lut_high);
func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_3_reg);
func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_3_reg);
func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_3_reg);
func_p3 = _mm512_permutexvar_ps(indices, tanh_p3_3_reg);
result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p3, func_p2);
result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p1);
result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0);
result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs));
return result;
}
#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512DQ__ needed*/
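/** GELU forward, x*Phi(x): the magnitude of x is clamped to ~4.24 (range instruction), the low
 *  mantissa bits of |x|*~3.66 + 1.5*2^23 (float-to-int "magic add") index 16 LUT entries, and the
 *  per-interval quadratic p(|x|) yields x*(p(|x|)*xr + 0.5), where xr is the clamped argument. */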
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) __m512 LIBXSMM_INTRINSICS_MM512_GELU_FWD_PS_MINIMAX3(__m512 x) {
const __m512 thres = _mm512_castsi512_ps(_mm512_set1_epi32(0x40879fff));
const __m512 absmask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff));
const __m512 scale = _mm512_castsi512_ps(_mm512_set1_epi32(0x406a0ea1));
const __m512 shifter = _mm512_castsi512_ps(_mm512_set1_epi32(0x4b400000));
const __m512 half = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f000000));
const __m512 _c2 = _mm512_castsi512_ps(_mm512_setr_epi32(0xbd877b85u, 0xbd7d9780u, 0xbd4cb70eu, 0xbd08a1e9u, 0xbc808857u, 0xb9476fd2u, 0x3c36f765u, 0x3c924160u,
0x3ca7b1fcu, 0x3ca5732cu, 0x3c95af63u, 0x3c8079f7u, 0x3c55fa4fu, 0x3c2fa86bu, 0x3c0fbb00u, 0x3bec178cu));
const __m512 _c1 = _mm512_castsi512_ps(_mm512_setr_epi32(0xb7c7fb58u, 0xbacb9740u, 0xbc3e4b3au, 0xbd0d292au, 0xbd8bc5d0u, 0xbdd9978fu, 0xbe0f92d3u, 0xbe27b66du,
0xbe328ce7u, 0xbe3125bfu, 0xbe26dc9du, 0xbe17a056u, 0xbe06bdebu, 0xbdecc593u, 0xbdcf57aau, 0xbdb5ea3au));
const __m512 _c0 = _mm512_castsi512_ps(_mm512_setr_epi32(0x3ecc4231u, 0x3ecc541cu, 0x3ecd6c48u, 0x3ed174c3u, 0x3ed9bd5du, 0x3ee5acd5u, 0x3ef2aeddu, 0x3efd5384u,
0x3f016724u, 0x3f00f778u, 0x3efb389eu, 0x3ef0464du, 0x3ee3014fu, 0x3ed50a78u, 0x3ec779dbu, 0x3ebae363u));
__m512 result;
__m512 xr = _mm512_range_round_ps(x, thres, 2, _MM_FROUND_NO_EXC);
__m512 xa = _mm512_and_ps(xr, absmask);
__m512 index = _mm512_fmadd_ps(xa, scale, shifter);
__m512 c2 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c2);
__m512 c1 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c1);
__m512 c0 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c0);
__m512 poly = _mm512_fmadd_ps(c2, xa, c1);
poly = _mm512_fmadd_ps(poly, xa, c0);
result = _mm512_mul_ps(x, _mm512_fmadd_ps(poly, xr, half));
return result;
}
#endif /*defined(LIBXSMM_INTRINSICS_AVX512_CORE)*/
#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512DQ__ needed*/
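/** GELU backward (derivative) counterpart of LIBXSMM_INTRINSICS_MM512_GELU_FWD_PS_MINIMAX3:
 *  same LUT scheme with its own coefficients (magnitude clamped to ~4.48, index scale ~3.46);
 *  returns p(|x|)*xr + 0.5. */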
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) __m512 LIBXSMM_INTRINSICS_MM512_GELU_BWD_PS_MINIMAX3(__m512 x) {
const __m512 thres = _mm512_castsi512_ps(_mm512_set1_epi32(0x408f5fff));
const __m512 absmask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff));
const __m512 scale = _mm512_castsi512_ps(_mm512_set1_epi32(0x405d67c9));
const __m512 shifter = _mm512_castsi512_ps(_mm512_set1_epi32(0x4b400000));
const __m512 half = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f000000));
const __m512 _c2 = _mm512_castsi512_ps(_mm512_setr_epi32(0xbe87047bu, 0xbe6eb875u, 0xbe2210c1u, 0xbd81727fu, 0x3cb9625cu, 0x3da2cbe8u, 0x3dd1d4d1u, 0x3dca0bd0u,
0x3da47dd0u, 0x3d6f1bd3u, 0x3d216381u, 0x3cd2618cu, 0x3c89f6e6u, 0x3c3ca672u, 0x3c08ed08u, 0x3bd26a14u));
const __m512 _c1 = _mm512_castsi512_ps(_mm512_setr_epi32(0xb930e738u, 0xbc4b28bau, 0xbda4212fu, 0xbe5feb0eu, 0xbec8b0e5u, 0xbf09e61bu, 0xbf1c403fu, 0xbf185954u,
0xbf03e1eeu, 0xbed08a61u, 0xbe9b4508u, 0xbe61788bu, 0xbe257770u, 0xbdfc542au, 0xbdca014eu, 0xbda8d7e9u));
const __m512 _c0 = _mm512_castsi512_ps(_mm512_setr_epi32(0x3f4c4245u, 0x3f4c927bu, 0x3f5085f8u, 0x3f5d7bdau, 0x3f73ea12u, 0x3f86142fu, 0x3f8d3df4u, 0x3f8b4b0fu,
0x3f8022c8u, 0x3f5e5423u, 0x3f39ceb5u, 0x3f199bedu, 0x3f00bee0u, 0x3ede1737u, 0x3ec59b86u, 0x3eb4454cu));
__m512 result;
__m512 xr = _mm512_range_round_ps(x, thres, 2, _MM_FROUND_NO_EXC);
__m512 xa = _mm512_and_ps(xr, absmask);
__m512 index = _mm512_fmadd_ps(xa, scale, shifter);
__m512 c2 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c2);
__m512 c1 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c1);
__m512 c0 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c0);
__m512 poly = _mm512_fmadd_ps(c2, xa, c1);
poly = _mm512_fmadd_ps(poly, xa, c0);
result = _mm512_fmadd_ps(poly, xr, half);
return result;
}
#endif /*defined(LIBXSMM_INTRINSICS_AVX512_CORE)*/
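/** GELU forward in its tanh form, 0.5*x*(1 + tanh(0.79788*x + 0.03568*x^3)); note that
 *  0.79788 ~ sqrt(2/pi) and 0.03568 ~ sqrt(2/pi)*0.044715, with tanh evaluated by
 *  LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2. */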
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_GELU_FWD(__m512 x) {
const __m512 c1 = _mm512_set1_ps( (float)0.79788);
const __m512 c2 = _mm512_set1_ps( (float)0.03568);
const __m512 c_half = _mm512_set1_ps( (float)0.5);
__m512 x_half = _mm512_mul_ps( x, c_half );
__m512 x_sq = _mm512_mul_ps( x, x );
__m512 poly_x1 = _mm512_mul_ps(x, _mm512_fmadd_ps( x_sq, c2, c1));
__m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(poly_x1);
__m512 output = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half);
return output;
}
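/** Derivative of the tanh-form GELU: with t = tanh(0.79788*x + 0.03568*x^3) and
 *  q = 0.39894*x + 0.05352*x^3, computes (1 + t)*(0.5 + q*(1 - t)) = 0.5*(1 + t) + q*(1 - t^2). */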
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_GELU_BWD(__m512 x) {
const __m512 c1 = _mm512_set1_ps( (float)0.79788);
const __m512 c2 = _mm512_set1_ps( (float)0.03568);
const __m512 c3 = _mm512_set1_ps( (float)0.05352);
const __m512 c4 = _mm512_set1_ps( (float)0.39894);
const __m512 c_half = _mm512_set1_ps( (float)0.5);
const __m512 c_ones = _mm512_set1_ps( (float)1.0);
const __m512 c_minus_1 = _mm512_set1_ps( (float)-1.0);
__m512 x_sq = _mm512_mul_ps( x, x );
__m512 poly_x1 = _mm512_mul_ps(x, _mm512_fmadd_ps( x_sq, c2, c1));
__m512 poly_x2 = _mm512_mul_ps(x, _mm512_fmadd_ps( x_sq, c3, c4));
__m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(poly_x1);
__m512 out1 = _mm512_add_ps(c_ones, tanh_poly_x);
__m512 out2 = _mm512_add_ps(c_half, poly_x2);
__m512 out3 = _mm512_fmsub_ps(poly_x2, tanh_poly_x, out2);
__m512 out4 = _mm512_mul_ps(c_minus_1, out3);
__m512 output = _mm512_mul_ps(out1, out4);
return output;
}
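/** exp(x) evaluated as 2^(x*log2(e)): the fractional part of the scaled argument is approximated
 *  by a degree-2 polynomial and the integer part is applied via scalef. */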
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_EXP_PS_2DTS(__m512 in) {
const __m512 log2_e = _mm512_set1_ps(1.442695f);
const __m512 half = _mm512_set1_ps(0.5f);
const __m512 c2 = _mm512_set1_ps(0.240226507f);
const __m512 c1 = _mm512_set1_ps(0.452920674f);
const __m512 c0 = _mm512_set1_ps(0.713483036f);
const __m512 x = _mm512_fmadd_ps(in, log2_e, half);
#if 1
const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
const __m512 y = _mm512_reduce_ps(x, 1);
#endif
const __m512 t1 = _mm512_fmadd_ps( y, c2, c1);
const __m512 two_to_y = _mm512_fmadd_ps( y, t1, c0);
const __m512 exp = _mm512_scalef_ps( two_to_y, x );
return exp;
}
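/** Degree-3 variant of LIBXSMM_INTRINSICS_MM512_EXP_PS_2DTS (higher accuracy). */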
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_EXP_PS_3DTS(__m512 in) {
const __m512 log2_e = _mm512_set1_ps(1.442695f);
const __m512 half = _mm512_set1_ps(0.5f);
const __m512 c3 = _mm512_set1_ps(0.05550410866f);
const __m512 c2 = _mm512_set1_ps(0.15697034396f);
const __m512 c1 = _mm512_set1_ps(0.49454875509f);
const __m512 c0 = _mm512_set1_ps(0.70654502287f);
const __m512 x = _mm512_fmadd_ps(in, log2_e, half);
#if 1
const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
const __m512 y = _mm512_reduce_ps(x, 1);
#endif
const __m512 t1 = _mm512_fmadd_ps( y, c3, c2);
const __m512 t2 = _mm512_fmadd_ps( y, t1, c1);
const __m512 two_to_y = _mm512_fmadd_ps( y, t2, c0);
const __m512 exp = _mm512_scalef_ps( two_to_y, x );
return exp;
}
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_OFF /* avoid ICE in case of symbols (-g) */
# endif
/** Generate random numbers: the EPI32 variant yields raw 32-bit integers, and the PS wrapper
 * below maps them into the interval [0, 1); not thread-safe since global state is updated.
 * This is based on xoshiro128+ 1.0, e.g., http://prng.di.unimi.it/xoshiro128plus.c */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(void) {
const __m512i result = _mm512_add_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(0), LIBXSMM_INTRINSICS_MM512_RNG_STATE(3));
const __m512i s = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), 9);
__m512i t;
LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), LIBXSMM_INTRINSICS_MM512_RNG_STATE(0));
LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), LIBXSMM_INTRINSICS_MM512_RNG_STATE(1));
LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), LIBXSMM_INTRINSICS_MM512_RNG_STATE(2));
LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(0), LIBXSMM_INTRINSICS_MM512_RNG_STATE(3));
LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), s);
t = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 11);
LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_or_epi32(t, _mm512_srli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 32 - 11));
return result;
}
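/** Merge 23 random mantissa bits with the exponent of 1.0f (0x3f800000) to form a float in
 *  [1, 2), then subtract 1 to obtain a uniform sample in [0, 1). */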
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_RNG_PS(void) {
const __m512i rng_mantissa = _mm512_srli_epi32( LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(), 9 );
const __m512 one = _mm512_set1_ps(1.0f);
return _mm512_sub_ps(_mm512_castsi512_ps(_mm512_or_epi32(_mm512_set1_epi32(0x3f800000), rng_mantissa)), one);
}
/** Generate random numbers in the interval [0, 1); thread-safe, as the state is supplied and
 * managed by the user. This is based on xoshiro128+ 1.0, e.g., http://prng.di.unimi.it/xoshiro128plus.c */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32(unsigned int* stateptr) {
__m512i state_0 = _mm512_loadu_si512( stateptr );
__m512i state_1 = _mm512_loadu_si512( stateptr+16 );
__m512i state_2 = _mm512_loadu_si512( stateptr+32 );
__m512i state_3 = _mm512_loadu_si512( stateptr+48 );
const __m512i result = _mm512_add_epi32(state_0, state_3);
const __m512i s = _mm512_slli_epi32(state_1, 9);
__m512i t;
state_2 = _mm512_xor_epi32(state_2, state_0);
state_3 = _mm512_xor_epi32(state_3, state_1);
state_1 = _mm512_xor_epi32(state_1, state_2);
state_0 = _mm512_xor_epi32(state_0, state_3);
state_2 = _mm512_xor_epi32(state_2, s);
_mm512_storeu_si512( stateptr , state_0 );
_mm512_storeu_si512( stateptr+16, state_1 );
_mm512_storeu_si512( stateptr+32, state_2 );
t = _mm512_slli_epi32(state_3, 11);
state_3 = _mm512_or_epi32(t, _mm512_srli_epi32(state_3, 32 - 11));
_mm512_storeu_si512( stateptr+48, state_3 );
return result;
}
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS(unsigned int* stateptr) {
const __m512i rng_mantissa = _mm512_srli_epi32( LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32(stateptr), 9 );
const __m512 one = _mm512_set1_ps(1.0f);
return _mm512_sub_ps(_mm512_castsi512_ps(_mm512_or_epi32(_mm512_set1_epi32(0x3f800000), rng_mantissa)), one);
}
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_ON
# endif
#endif /*__AVX512F__*/
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
#endif /*LIBXSMM_INTRINSICS_X86_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MACROS_H
#define LIBXSMM_MACROS_H
#include "libxsmm_config.h"
/** Parameters the library was built for. */
#define LIBXSMM_CACHELINE LIBXSMM_CONFIG_CACHELINE
#define LIBXSMM_ALIGNMENT LIBXSMM_CONFIG_ALIGNMENT
#define LIBXSMM_MALLOC LIBXSMM_CONFIG_MALLOC
#define LIBXSMM_ILP64 LIBXSMM_CONFIG_ILP64
#define LIBXSMM_SYNC LIBXSMM_CONFIG_SYNC
#define LIBXSMM_JIT LIBXSMM_CONFIG_JIT
/** Parameters of GEMM domain (static kernels, etc). */
#define LIBXSMM_PREFETCH LIBXSMM_CONFIG_PREFETCH
#define LIBXSMM_MAX_MNK LIBXSMM_CONFIG_MAX_MNK
#define LIBXSMM_MAX_DIM LIBXSMM_CONFIG_MAX_DIM
#define LIBXSMM_MAX_M LIBXSMM_CONFIG_MAX_M
#define LIBXSMM_MAX_N LIBXSMM_CONFIG_MAX_N
#define LIBXSMM_MAX_K LIBXSMM_CONFIG_MAX_K
#define LIBXSMM_FLAGS LIBXSMM_CONFIG_FLAGS
#define LIBXSMM_ALPHA LIBXSMM_CONFIG_ALPHA
#define LIBXSMM_BETA LIBXSMM_CONFIG_BETA
/**
 * Use "make PLATFORM=1" to disable platform checks.
 * The platform check bails out with an error message
 * when an attempt is made to build an upstream package
 * for an unsupported platform, which would otherwise
 * list LIBXSMM as "broken" on that platform.
 * Note: successful compilation on an unsupported
 * platform is desirable, but at best only fallback
 * code is present.
 */
#if !defined(LIBXSMM_PLATFORM_FORCE) && 0
# define LIBXSMM_PLATFORM_FORCE
#endif
#if !defined(LIBXSMM_PLATFORM_X86) && ( \
(defined(__x86_64__) && 0 != (__x86_64__)) || \
(defined(__amd64__) && 0 != (__amd64__)) || \
(defined(_M_X64) || defined(_M_AMD64)) || \
(defined(__i386__) && 0 != (__i386__)) || \
(defined(_M_IX86)))
# define LIBXSMM_PLATFORM_X86
#endif
#if !defined(LIBXSMM_PLATFORM_AARCH64) && \
(defined(__aarch64__) || defined(__arm64__))
# define LIBXSMM_PLATFORM_AARCH64
#endif
#if !defined(LIBXSMM_PLATFORM_SUPPORTED)
# if defined(LIBXSMM_PLATFORM_X86) || defined(LIBXSMM_PLATFORM_AARCH64)
# define LIBXSMM_PLATFORM_SUPPORTED
# elif !defined(LIBXSMM_PLATFORM_FORCE)
# error LIBXSMM requires X86_64, AArch64, or compatible CPUs!
# endif
#endif
#if !defined(LIBXSMM_BITS)
# if (defined(__SIZEOF_PTRDIFF_T__) && 4 < (__SIZEOF_PTRDIFF_T__)) || \
(defined(__SIZE_MAX__) && (4294967295U < (__SIZE_MAX__))) || \
(defined(__x86_64__) && 0 != (__x86_64__)) || \
(defined(__amd64__) && 0 != (__amd64__)) || \
(defined(_M_X64) || defined(_M_AMD64)) || \
(defined(_WIN64)) || \
(defined(__powerpc64)) || \
(defined(__aarch64__))
# define LIBXSMM_UNLIMITED 0xFFFFFFFFFFFFFFFF
# define LIBXSMM_BITS 64
# elif !defined(LIBXSMM_PLATFORM_FORCE) && defined(NDEBUG)
# error LIBXSMM is only supported on 64-bit platforms!
# else /* JIT-generated code (among other issues) is not supported! */
# define LIBXSMM_UNLIMITED 0xFFFFFFFF
# define LIBXSMM_BITS 32
# endif
#endif
#define LIBXSMM_STRINGIFY2(SYMBOL) #SYMBOL
#define LIBXSMM_STRINGIFY(SYMBOL) LIBXSMM_STRINGIFY2(SYMBOL)
#define LIBXSMM_TOSTRING(SYMBOL) LIBXSMM_STRINGIFY(SYMBOL)
#define LIBXSMM_CONCATENATE2(A, B) A##B
#define LIBXSMM_CONCATENATE3(A, B, C) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE(A, B), C)
#define LIBXSMM_CONCATENATE4(A, B, C, D) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE3(A, B, C), D)
#define LIBXSMM_CONCATENATE(A, B) LIBXSMM_CONCATENATE2(A, B)
#define LIBXSMM_FSYMBOL(SYMBOL) LIBXSMM_CONCATENATE(SYMBOL, _)
#define LIBXSMM_UNIQUE(NAME) LIBXSMM_CONCATENATE(NAME, __LINE__)
#define LIBXSMM_EXPAND(...) __VA_ARGS__
#define LIBXSMM_ELIDE(...)
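/* Illustrative expansions of the helpers above:
 * LIBXSMM_CONCATENATE(foo_, bar) yields foo_bar,
 * LIBXSMM_STRINGIFY(LIBXSMM_BITS) yields "64" on a 64-bit platform (the argument is expanded first),
 * whereas LIBXSMM_STRINGIFY2(LIBXSMM_BITS) yields "LIBXSMM_BITS" (no expansion). */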
/**
* Check given value against type-range (assertion).
* Note: allows "-1" for unsigned types.
*/
#if !defined(NDEBUG)
# define LIBXSMM_CHECK_ULLONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULLONG_MAX)
# define LIBXSMM_CHECK_LLONG(VALUE) assert(LLONG_MIN <= (VALUE) && (VALUE) <= LLONG_MAX)
# define LIBXSMM_CHECK_ULONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULONG_MAX)
# define LIBXSMM_CHECK_LONG(VALUE) assert(LONG_MIN <= (VALUE) && (VALUE) <= LONG_MAX)
# define LIBXSMM_CHECK_USHORT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= USHRT_MAX)
# define LIBXSMM_CHECK_SHORT(VALUE) assert(SHRT_MIN <= (VALUE) && (VALUE) <= SHRT_MAX)
# define LIBXSMM_CHECK_UCHAR(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UCHAR_MAX)
# define LIBXSMM_CHECK_ICHAR(VALUE) assert(SCHAR_MIN <= (VALUE) && (VALUE) <= SCHAR_MAX)
# define LIBXSMM_CHECK_UINT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UINT_MAX)
# define LIBXSMM_CHECK_INT(VALUE) assert(INT_MIN <= (VALUE) && (VALUE) <= INT_MAX)
#else
# define LIBXSMM_CHECK_ULLONG(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_LLONG(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_ULONG(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_LONG(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_USHORT(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_SHORT(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_UCHAR(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_ICHAR(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_UINT(VALUE) 0/*dummy*/
# define LIBXSMM_CHECK_INT(VALUE) 0/*dummy*/
#endif
/**
 * Perform a verbose type-cast with the following two advantages:
 * (1) Makes it easy to locate/find the type-cast.
 * (2) Range-checks to ensure the value fits into the type.
*/
#define LIBXSMM_CAST_ULLONG(VALUE) (LIBXSMM_CHECK_ULLONG(VALUE), (unsigned long long)(VALUE))
#define LIBXSMM_CAST_LLONG(VALUE) (LIBXSMM_CHECK_LLONG(VALUE), (/*signed*/long long)(VALUE))
#define LIBXSMM_CAST_ULONG(VALUE) (LIBXSMM_CHECK_ULONG(VALUE), (unsigned long)(VALUE))
#define LIBXSMM_CAST_LONG(VALUE) (LIBXSMM_CHECK_LONG(VALUE), (/*signed*/long)(VALUE))
#define LIBXSMM_CAST_USHORT(VALUE) (LIBXSMM_CHECK_USHORT(VALUE), (unsigned short)(VALUE))
#define LIBXSMM_CAST_SHORT(VALUE) (LIBXSMM_CHECK_SHORT(VALUE), (/*signed*/short)(VALUE))
#define LIBXSMM_CAST_UCHAR(VALUE) (LIBXSMM_CHECK_UCHAR(VALUE), (unsigned char)(VALUE))
#define LIBXSMM_CAST_ICHAR(VALUE) (LIBXSMM_CHECK_ICHAR(VALUE), (signed char)(VALUE))
#define LIBXSMM_CAST_UINT(VALUE) (LIBXSMM_CHECK_UINT(VALUE), (unsigned int)(VALUE))
#define LIBXSMM_CAST_INT(VALUE) (LIBXSMM_CHECK_INT(VALUE), (/*signed*/int)(VALUE))
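/* Example (illustrative; "nbytes" is a hypothetical variable):
 * size_t nbytes = 64;
 * int n = LIBXSMM_CAST_INT(nbytes); // range-checked by assert in debug builds, plain cast otherwise
 */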
/** Use LIBXSMM_VERSION2 instead of LIBXSMM_VERSION3, e.g., if __GNUC_PATCHLEVEL__ or __clang_patchlevel__ is zero (0). */
#define LIBXSMM_VERSION2(MAJOR, MINOR) ((MAJOR) * 10000 + (MINOR) * 100)
#define LIBXSMM_VERSION3(MAJOR, MINOR, UPDATE) (LIBXSMM_VERSION2(MAJOR, MINOR) + (UPDATE))
#define LIBXSMM_VERSION4(MAJOR, MINOR, UPDATE, PATCH) \
(((0x7F & (MAJOR)) << 24) | ((0x1F & (MINOR)) << 19) | ((0x1F & (UPDATE)) << 14) | (0x3FFF & (PATCH)))
#define LIBXSMM_VERSION41(VERSION) (((VERSION) >> 24))
#define LIBXSMM_VERSION42(VERSION) (((VERSION) >> 19) & 0x1F)
#define LIBXSMM_VERSION43(VERSION) (((VERSION) >> 14) & 0x1F)
#define LIBXSMM_VERSION44(VERSION) (((VERSION)) & 0x3FFF)
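/* Worked example: LIBXSMM_VERSION4(1, 16, 1, 2) packs (1 << 24) | (16 << 19) | (1 << 14) | 2,
 * and LIBXSMM_VERSION41..LIBXSMM_VERSION44 recover 1, 16, 1, and 2 respectively. */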
#if !defined(LIBXSMM_UNPACKED) && (defined(_CRAYC) || defined(LIBXSMM_OFFLOAD_BUILD) || \
(0 == LIBXSMM_SYNC)/*Windows: missing pack(pop) error*/)
# define LIBXSMM_UNPACKED
#endif
#if defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__)
# define LIBXSMM_ATTRIBUTE(A) __declspec(A)
# if defined(__cplusplus)
# define LIBXSMM_INLINE_ALWAYS __forceinline
# else
# define LIBXSMM_INLINE_ALWAYS static __forceinline
# endif
# define LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(align(N)) DECL
# if !defined(LIBXSMM_UNPACKED)
# define LIBXSMM_PACKED(TYPE) LIBXSMM_PRAGMA(pack(1)) TYPE
# endif
# define LIBXSMM_CDECL __cdecl
#elif (defined(__GNUC__) || defined(__clang__) || defined(__PGI))
# define LIBXSMM_ATTRIBUTE(A) __attribute__((A))
# define LIBXSMM_INLINE_ALWAYS LIBXSMM_ATTRIBUTE(always_inline) LIBXSMM_INLINE
# define LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(aligned(N)) DECL
# if !defined(LIBXSMM_UNPACKED)
# define LIBXSMM_PACKED(TYPE) TYPE LIBXSMM_ATTRIBUTE(__packed__)
# endif
# define LIBXSMM_CDECL LIBXSMM_ATTRIBUTE(cdecl)
#else
# define LIBXSMM_ATTRIBUTE(A)
# define LIBXSMM_INLINE_ALWAYS LIBXSMM_INLINE
# define LIBXSMM_ALIGNED(DECL, N) DECL
# define LIBXSMM_CDECL
#endif
#if !defined(LIBXSMM_PACKED)
# define LIBXSMM_PACKED(TYPE) TYPE
# if !defined(LIBXSMM_UNPACKED)
# define LIBXSMM_UNPACKED
# endif
#endif
#if !defined(LIBXSMM_UNPACKED) && 0
/* no braces around EXPR */
# define LIBXSMM_PAD(EXPR) EXPR;
#endif
#if !defined(LIBXSMM_PAD)
# define LIBXSMM_PAD(EXPR)
#endif
#if defined(__INTEL_COMPILER)
# if !defined(__INTEL_COMPILER_UPDATE)
# define LIBXSMM_INTEL_COMPILER __INTEL_COMPILER
# else
# define LIBXSMM_INTEL_COMPILER (__INTEL_COMPILER + __INTEL_COMPILER_UPDATE)
# endif
#elif defined(__INTEL_COMPILER_BUILD_DATE)
# define LIBXSMM_INTEL_COMPILER ((__INTEL_COMPILER_BUILD_DATE / 10000 - 2000) * 100)
#endif
/* LIBXSMM_ATTRIBUTE_USED: mark library functions as used to avoid warnings */
#if defined(__GNUC__) || defined(__clang__) || (defined(__INTEL_COMPILER) && !defined(_WIN32))
# if !defined(__cplusplus) || !defined(__clang__)
# define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(common)
# else
# define LIBXSMM_ATTRIBUTE_COMMON
# endif
# define LIBXSMM_ATTRIBUTE_MALLOC LIBXSMM_ATTRIBUTE(malloc)
# define LIBXSMM_ATTRIBUTE_UNUSED LIBXSMM_ATTRIBUTE(unused)
# define LIBXSMM_ATTRIBUTE_USED LIBXSMM_ATTRIBUTE(used)
#else
# if defined(_WIN32)
# define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(selectany)
# else
# define LIBXSMM_ATTRIBUTE_COMMON
# endif
# define LIBXSMM_ATTRIBUTE_MALLOC
# define LIBXSMM_ATTRIBUTE_UNUSED
# define LIBXSMM_ATTRIBUTE_USED
#endif
#if defined(__clang__) && !defined(__INTEL_COMPILER)
# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(no_sanitize(LIBXSMM_STRINGIFY(KIND)))
#elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 8) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) \
&& !defined(__INTEL_COMPILER)
# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(LIBXSMM_CONCATENATE(no_sanitize_, KIND))
#else
# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND)
#endif
#if defined(__cplusplus)
# define LIBXSMM_VARIADIC ...
# define LIBXSMM_EXTERN extern "C"
# define LIBXSMM_EXTERN_C extern "C"
# define LIBXSMM_INLINE_KEYWORD inline
# define LIBXSMM_INLINE LIBXSMM_INLINE_KEYWORD
# if defined(__GNUC__) || defined(_CRAYC)
# define LIBXSMM_CALLER __PRETTY_FUNCTION__
# elif defined(_MSC_VER)
# define LIBXSMM_CALLER __FUNCDNAME__
# define LIBXSMM_FUNCNAME __FUNCTION__
# else
# define LIBXSMM_CALLER __FUNCNAME__
# endif
#else /* C */
# define LIBXSMM_VARIADIC
# define LIBXSMM_EXTERN extern
# define LIBXSMM_EXTERN_C
# if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__) /*C99*/
# define LIBXSMM_PRAGMA(DIRECTIVE) _Pragma(LIBXSMM_STRINGIFY(DIRECTIVE))
# define LIBXSMM_CALLER __func__
# define LIBXSMM_RESTRICT restrict
# define LIBXSMM_INLINE_KEYWORD inline
# elif defined(_MSC_VER)
# define LIBXSMM_CALLER __FUNCDNAME__
# define LIBXSMM_FUNCNAME __FUNCTION__
# define LIBXSMM_INLINE_KEYWORD __inline
# define LIBXSMM_INLINE_FIXUP
# elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
# define LIBXSMM_CALLER __PRETTY_FUNCTION__
# endif
# if !defined(LIBXSMM_INLINE_KEYWORD)
# define LIBXSMM_INLINE_KEYWORD
# define LIBXSMM_INLINE_FIXUP
# endif
/* LIBXSMM_ATTRIBUTE_USED is omitted here: it increases the compile-time of header-only usage by a large factor */
# define LIBXSMM_INLINE static LIBXSMM_INLINE_KEYWORD LIBXSMM_ATTRIBUTE_UNUSED
#endif /*__cplusplus*/
#if !defined(LIBXSMM_CALLER)
# define LIBXSMM_CALLER NULL
#endif
#if !defined(LIBXSMM_FUNCNAME)
# define LIBXSMM_FUNCNAME LIBXSMM_CALLER
#endif
#if !defined(LIBXSMM_CALLER_ID)
# if defined(__GNUC__) || 1
# define LIBXSMM_CALLER_ID ((const void*)((uintptr_t)libxsmm_hash_string(LIBXSMM_CALLER)))
# else /* assume no string-pooling (perhaps unsafe) */
# define LIBXSMM_CALLER_ID LIBXSMM_CALLER
# endif
#endif
#if defined(LIBXSMM_OFFLOAD_BUILD) && \
defined(__INTEL_OFFLOAD) && (!defined(_WIN32) || (1400 <= LIBXSMM_INTEL_COMPILER))
# define LIBXSMM_OFFLOAD(A) LIBXSMM_ATTRIBUTE(target(A))
# define LIBXSMM_NO_OFFLOAD(RTYPE, FN, ...) ((RTYPE (*)(LIBXSMM_VARIADIC))(FN))(__VA_ARGS__)
# if !defined(LIBXSMM_OFFLOAD_TARGET)
# define LIBXSMM_OFFLOAD_TARGET mic
# endif
#else
# define LIBXSMM_OFFLOAD(A)
# define LIBXSMM_NO_OFFLOAD(RTYPE, FN, ...) (FN)(__VA_ARGS__)
#endif
#define LIBXSMM_RETARGETABLE LIBXSMM_OFFLOAD(LIBXSMM_OFFLOAD_TARGET)
#if !defined(__STATIC) && !defined(_WINDLL) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__))
# define __STATIC
#endif
/* may include Clang and other compatible compilers */
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__)
# define LIBXSMM_VISIBILITY_INTERNAL LIBXSMM_ATTRIBUTE(visibility("internal"))
# define LIBXSMM_VISIBILITY_HIDDEN LIBXSMM_ATTRIBUTE(visibility("hidden"))
# define LIBXSMM_VISIBILITY_PUBLIC LIBXSMM_ATTRIBUTE(visibility("default"))
#endif
#if !defined(LIBXSMM_VISIBILITY_INTERNAL)
# define LIBXSMM_VISIBILITY_INTERNAL
#endif
#if !defined(LIBXSMM_VISIBILITY_HIDDEN)
# define LIBXSMM_VISIBILITY_HIDDEN
#endif
#if !defined(LIBXSMM_VISIBILITY_PUBLIC)
# define LIBXSMM_VISIBILITY_PUBLIC
#endif
#if !defined(LIBXSMM_VISIBILITY_PRIVATE)
# define LIBXSMM_VISIBILITY_PRIVATE LIBXSMM_VISIBILITY_HIDDEN
#endif
/* Windows Dynamic Link Library (DLL) */
#if !defined(__STATIC) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__))
# define LIBXSMM_VISIBILITY_EXPORT LIBXSMM_ATTRIBUTE(dllexport)
# define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_ATTRIBUTE(dllimport)
#endif
#if !defined(LIBXSMM_VISIBILITY_EXPORT)
# define LIBXSMM_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_PUBLIC
#endif
#if !defined(LIBXSMM_VISIBILITY_IMPORT)
# define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_PUBLIC
#endif
#if defined(LIBXSMM_SOURCE_H) /* header-only mode */
# define LIBXSMM_API_VISIBILITY_EXPORT
# define LIBXSMM_API_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_INTERN
# define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE LIBXSMM_ATTRIBUTE_COMMON
# define LIBXSMM_API_TARGET LIBXSMM_API_INLINE
# define LIBXSMM_API_EXTERN LIBXSMM_EXTERN_C
#else /* classic ABI */
# if defined(LIBXSMM_BUILD_EXT)
# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_EXPORT
# define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE
# elif defined(LIBXSMM_BUILD)
# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_EXPORT
# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE
# else /* import */
# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_INTERN
# endif
# define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE
# define LIBXSMM_API_TARGET LIBXSMM_RETARGETABLE
# define LIBXSMM_API_EXTERN LIBXSMM_EXTERN
#endif
#define LIBXSMM_API_VISIBILITY(VISIBILITY) LIBXSMM_CONCATENATE(LIBXSMM_API_VISIBILITY_, VISIBILITY)
#define LIBXSMM_APIVAR(DECL, VISIBILITY, EXTERN) EXTERN LIBXSMM_API_COMMON LIBXSMM_API_VISIBILITY(VISIBILITY) DECL
#define LIBXSMM_API_INLINE LIBXSMM_INLINE LIBXSMM_RETARGETABLE
#define LIBXSMM_API_DEF
#if (!defined(__INTEL_COMPILER) || !defined(_WIN32))
#define LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_ALIGNED(LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF), LIBXSMM_CONFIG_CACHELINE)
#else
#define LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF)
#endif
/** Public variable declaration (without definition) located in header file. */
#define LIBXSMM_APIVAR_PUBLIC(DECL) LIBXSMM_APIVAR(DECL, EXPORT, LIBXSMM_API_EXTERN)
/** Public variable definition (complements declaration) located in source file. */
#define LIBXSMM_APIVAR_PUBLIC_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, EXPORT)
/** Private variable declaration (without definition) located in header file. */
#define LIBXSMM_APIVAR_PRIVATE(DECL) LIBXSMM_APIVAR(DECL, INTERN, LIBXSMM_API_EXTERN)
/** Private variable definition (complements declaration) located in source file. */
#define LIBXSMM_APIVAR_PRIVATE_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, INTERN)
/** Private variable (declaration and definition) located in source file. */
#define LIBXSMM_APIVAR_DEFINE(DECL) LIBXSMM_APIVAR_PRIVATE(DECL); LIBXSMM_APIVAR_PRIVATE_DEF(DECL)
/** Function decoration used for private functions. */
#define LIBXSMM_API_INTERN LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(INTERN)
/** Function decoration used for public functions of LIBXSMMext library. */
#define LIBXSMM_APIEXT LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(IMPORT)
/** Function decoration used for public functions of LIBXSMM library. */
#define LIBXSMM_API LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(EXPORT)
#if !defined(LIBXSMM_RESTRICT)
# if ((defined(__GNUC__) && !defined(__CYGWIN32__)) || defined(LIBXSMM_INTEL_COMPILER)) && !defined(_WIN32)
# define LIBXSMM_RESTRICT __restrict__
# elif defined(_MSC_VER) || defined(LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_RESTRICT __restrict
# else
# define LIBXSMM_RESTRICT
# endif
#endif /*LIBXSMM_RESTRICT*/
#if !defined(LIBXSMM_PRAGMA)
# if defined(LIBXSMM_INTEL_COMPILER) || defined(_MSC_VER)
# define LIBXSMM_PRAGMA(DIRECTIVE) __pragma(LIBXSMM_EXPAND(DIRECTIVE))
# else
# define LIBXSMM_PRAGMA(DIRECTIVE)
# endif
#endif /*LIBXSMM_PRAGMA*/
#if !defined(LIBXSMM_OPENMP_SIMD) && (defined(_OPENMP) && (201307 <= _OPENMP/*v4.0*/))
# if defined(LIBXSMM_INTEL_COMPILER)
# if (1500 <= LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_OPENMP_SIMD
# endif
# elif defined(__GNUC__)
# if LIBXSMM_VERSION2(4, 9) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)
# define LIBXSMM_OPENMP_SIMD
# endif
# else
# define LIBXSMM_OPENMP_SIMD
# endif
#endif
#if !defined(LIBXSMM_INTEL_COMPILER) || (LIBXSMM_INTEL_COMPILER < 9900)
# if defined(LIBXSMM_OPENMP_SIMD)
# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(omp simd reduction(EXPRESSION))
# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(omp simd collapse(N))
# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(omp simd private(__VA_ARGS__))
# define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(omp simd)
# elif defined(__INTEL_COMPILER)
# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(simd reduction(EXPRESSION))
# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(simd collapse(N))
# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(simd private(__VA_ARGS__))
# define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(simd)
# endif
#endif
#if !defined(LIBXSMM_PRAGMA_SIMD)
# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION)
# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N)
# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...)
# define LIBXSMM_PRAGMA_SIMD
#endif
#if defined(__INTEL_COMPILER)
# define LIBXSMM_PRAGMA_NONTEMPORAL(...) LIBXSMM_PRAGMA(vector nontemporal(__VA_ARGS__))
# define LIBXSMM_PRAGMA_VALIGNED LIBXSMM_PRAGMA(vector aligned)
# define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(novector)
# define LIBXSMM_PRAGMA_FORCEINLINE LIBXSMM_PRAGMA(forceinline)
# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(loop_count min=MIN max=MAX avg=AVG)
# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll_and_jam(N))
# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N))
# define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA(unroll)
# define LIBXSMM_PRAGMA_VALIGNED_VAR(A) LIBXSMM_ASSUME_ALIGNED(A, LIBXSMM_ALIGNMENT);
/*# define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE))*/
#else
# if defined(LIBXSMM_OPENMP_SIMD) && (201811 <= _OPENMP/*v5.0*/)
# define LIBXSMM_PRAGMA_NONTEMPORAL(...) LIBXSMM_PRAGMA(omp simd nontemporal(__VA_ARGS__))
# else
# define LIBXSMM_PRAGMA_NONTEMPORAL(...)
# endif
# if defined(__clang__)
# define LIBXSMM_PRAGMA_VALIGNED_VAR(A)
# define LIBXSMM_PRAGMA_VALIGNED
# define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(clang loop vectorize(disable))
# define LIBXSMM_PRAGMA_FORCEINLINE
# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(unroll(AVG))
# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll(N))
# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N))
# define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA_UNROLL_N(4)
# else
# define LIBXSMM_PRAGMA_VALIGNED_VAR(A)
# define LIBXSMM_PRAGMA_VALIGNED
# define LIBXSMM_PRAGMA_NOVECTOR
# define LIBXSMM_PRAGMA_FORCEINLINE
# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG)
# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N)
# define LIBXSMM_PRAGMA_UNROLL
# endif
#endif
#if !defined(LIBXSMM_PRAGMA_UNROLL_N)
# if defined(__GNUC__) && (LIBXSMM_VERSION2(8, 3) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))
# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(GCC unroll N)
# else
# define LIBXSMM_PRAGMA_UNROLL_N(N)
# endif
#endif
#if defined(LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(optimize("", off))
# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(optimize("", on))
#elif defined(__clang__)
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(clang optimize off)
# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(clang optimize on)
#elif defined(__GNUC__)
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(GCC push_options) LIBXSMM_PRAGMA(GCC optimize("O0"))
# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(GCC pop_options)
#else
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF
# define LIBXSMM_PRAGMA_OPTIMIZE_ON
#endif
#if defined(_OPENMP) && (200805 <= _OPENMP/*v3.0*/) \
&& defined(NDEBUG) /* CCE complains for debug builds */
# define LIBXSMM_OPENMP_COLLAPSE(N) collapse(N)
#else
# define LIBXSMM_OPENMP_COLLAPSE(N)
#endif
/** LIBXSMM_UP2POT rounds up to the next power of two (POT). */
#define LIBXSMM_UP2POT_01(N) ((N) | ((N) >> 1))
#define LIBXSMM_UP2POT_02(N) (LIBXSMM_UP2POT_01(N) | (LIBXSMM_UP2POT_01(N) >> 2))
#define LIBXSMM_UP2POT_04(N) (LIBXSMM_UP2POT_02(N) | (LIBXSMM_UP2POT_02(N) >> 4))
#define LIBXSMM_UP2POT_08(N) (LIBXSMM_UP2POT_04(N) | (LIBXSMM_UP2POT_04(N) >> 8))
#define LIBXSMM_UP2POT_16(N) (LIBXSMM_UP2POT_08(N) | (LIBXSMM_UP2POT_08(N) >> 16))
#define LIBXSMM_UP2POT_32(N) (LIBXSMM_UP2POT_16(N) | (LIBXSMM_UP2POT_16(N) >> 32))
#define LIBXSMM_UP2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) - LIBXSMM_MIN(1, N)) + LIBXSMM_MIN(1, N))
#define LIBXSMM_LO2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) >> 1) + LIBXSMM_MIN(1, N))
#define LIBXSMM_UPDIV(N, MULT) (((N) + ((MULT) - 1)) / (MULT))
#define LIBXSMM_UP(N, MULT) (LIBXSMM_UPDIV(N, MULT) * (MULT))
#define LIBXSMM_UP2(N, NPOT) (((N) + ((NPOT) - 1)) & ~((NPOT) - 1))
#define LIBXSMM_ABS(A) (0 <= (A) ? (A) : -(A))
#define LIBXSMM_MIN(A, B) ((A) < (B) ? (A) : (B))
#define LIBXSMM_MAX(A, B) ((A) < (B) ? (B) : (A))
#define LIBXSMM_MOD(A, N) ((A) % (N))
#define LIBXSMM_MOD2(A, NPOT) ((A) & ((NPOT) - 1))
#define LIBXSMM_DELTA(T0, T1) ((T0) < (T1) ? ((T1) - (T0)) : ((T0) - (T1)))
#define LIBXSMM_CLMP(VALUE, LO, HI) ((LO) < (VALUE) ? ((VALUE) <= (HI) ? (VALUE) : LIBXSMM_MIN(VALUE, HI)) : LIBXSMM_MAX(LO, VALUE))
#define LIBXSMM_SIZEOF(START, LAST) (((const char*)(LAST)) - ((const char*)(START)) + sizeof(*LAST))
#define LIBXSMM_FEQ(A, B) ((A) == (B))
#define LIBXSMM_NEQ(A, B) ((A) != (B))
#define LIBXSMM_ISPOT(A) (0 != (A) && !((A) & ((A) - 1)))
#define LIBXSMM_ISWAP(A, B) (((A) ^= (B)), ((B) ^= (A)), ((A) ^= (B)))
#define LIBXSMM_ISNAN(A) LIBXSMM_NEQ(A, A)
#define LIBXSMM_NOTNAN(A) LIBXSMM_FEQ(A, A)
#define LIBXSMM_ROUNDX(TYPE, A) ((TYPE)((long long)(0 <= (A) ? ((double)(A) + 0.5) : ((double)(A) - 0.5))))
#define LIBXSMM_CONST_VOID_PTR(A) *((const void**)&(A))
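/* Worked examples for the helpers above:
 * LIBXSMM_UP2POT(5) -> 8, LIBXSMM_LO2POT(5) -> 4,
 * LIBXSMM_UPDIV(10, 4) -> 3, LIBXSMM_UP(10, 4) -> 12, LIBXSMM_UP2(10, 8) -> 16,
 * LIBXSMM_MOD2(13, 8) -> 5, LIBXSMM_CLMP(7, 0, 5) -> 5. */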
/** Makes some functions available independently of C99 support. */
#if defined(__STDC_VERSION__) && (199901L/*C99*/ <= __STDC_VERSION__)
# if defined(__PGI)
# define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B)))
# else
# define LIBXSMM_POWF(A, B) powf(A, B)
# endif
# define LIBXSMM_FREXPF(A, B) frexpf(A, B)
# define LIBXSMM_ROUNDF(A) roundf(A)
# define LIBXSMM_ROUND(A) round(A)
# define LIBXSMM_TANHF(A) tanhf(A)
# define LIBXSMM_SQRTF(A) sqrtf(A)
# define LIBXSMM_EXP2F(A) exp2f(A)
# define LIBXSMM_LOG2F(A) log2f(A)
# define LIBXSMM_ERFF(A) erff(A)
# define LIBXSMM_EXP2(A) exp2(A)
# define LIBXSMM_LOG2(A) log2(A)
# define LIBXSMM_EXPF(A) expf(A)
# define LIBXSMM_LOGF(A) logf(A)
#else
# define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B)))
# define LIBXSMM_FREXPF(A, B) ((float)frexp((float)(A), B))
# define LIBXSMM_ROUNDF(A) LIBXSMM_ROUNDX(float, A)
# define LIBXSMM_ROUND(A) LIBXSMM_ROUNDX(double, A)
# define LIBXSMM_TANHF(A) ((float)tanh((float)(A)))
# define LIBXSMM_SQRTF(A) ((float)sqrt((float)(A)))
# define LIBXSMM_EXP2F(A) LIBXSMM_POWF(2, A)
# define LIBXSMM_LOG2F(A) ((float)LIBXSMM_LOG2((float)(A)))
# define LIBXSMM_ERFF(A) ((float)erf((float)(A)))
# define LIBXSMM_EXP2(A) pow(2.0, A)
# define LIBXSMM_LOG2(A) (log(A) * (1.0 / (M_LN2)))
# define LIBXSMM_EXPF(A) ((float)exp((float)(A)))
# define LIBXSMM_LOGF(A) ((float)log((float)(A)))
#endif
#if defined(LIBXSMM_INTEL_COMPILER)
# if (1700 <= LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION)
# else
# define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION)
# endif
#elif defined(_MSC_VER)
# define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION)
#elif defined(__GNUC__) && !defined(_CRAYC) && (LIBXSMM_VERSION2(4, 5) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))
# define LIBXSMM_ASSUME(EXPRESSION) do { if (!(EXPRESSION)) __builtin_unreachable(); } while(0)
#else
# define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION)
#endif
#if defined(__INTEL_COMPILER)
# define LIBXSMM_ASSUME_ALIGNED(A, N) __assume_aligned(A, N)
#else
# define LIBXSMM_ASSUME_ALIGNED(A, N) assert(0 == ((uintptr_t)(A)) % (N))
#endif
#define LIBXSMM_ALIGN(POINTER, ALIGNMENT/*POT*/) ((POINTER) + (LIBXSMM_UP2((uintptr_t)(POINTER), ALIGNMENT) - ((uintptr_t)(POINTER))) / sizeof(*(POINTER)))
#define LIBXSMM_FOLD2(POINTER, ALIGNMENT, NPOT) LIBXSMM_MOD2(((uintptr_t)(POINTER) / (ALIGNMENT)), NPOT)
#if defined(_MSC_VER) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) /* account for incorrect handling of __VA_ARGS__ */
# define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)LIBXSMM_EXPAND((__VA_ARGS__))
#else
# define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)(__VA_ARGS__)
#endif
#define LIBXSMM_SELECT_ELEMENT_1(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E0
#define LIBXSMM_SELECT_ELEMENT_2(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E1
#define LIBXSMM_SELECT_ELEMENT_3(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E2
#define LIBXSMM_SELECT_ELEMENT_4(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E3
#define LIBXSMM_SELECT_ELEMENT_5(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E4
#define LIBXSMM_SELECT_ELEMENT_6(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E5
#define LIBXSMM_SELECT_ELEMENT_7(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E6
#define LIBXSMM_SELECT_ELEMENT_8(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E7
#define LIBXSMM_SELECT_ELEMENT_9(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E8
#define LIBXSMM_SELECT_ELEMENT_10(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E9
#define LIBXSMM_SELECT_HEAD_AUX(A, ...) (A)
#define LIBXSMM_SELECT_HEAD(...) LIBXSMM_EXPAND(LIBXSMM_SELECT_HEAD_AUX(__VA_ARGS__, 0/*dummy*/))
#define LIBXSMM_SELECT_TAIL(A, ...) __VA_ARGS__
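/* Illustrative expansions: LIBXSMM_SELECT_ELEMENT(2, a, b, c, d, e, f, g, h, i, j) yields b;
 * LIBXSMM_SELECT_HEAD(a, b, c) yields (a), and LIBXSMM_SELECT_TAIL(a, b, c) yields b, c. */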
/**
* For VLAs, check EXACTLY for C99 since a C11-conforming compiler may not provide VLAs.
* However, some compilers (Intel) may signal support for VLA even with strict ANSI (C89).
* To ultimately disable VLA-support, define LIBXSMM_NO_VLA (make VLA=0).
* VLA-support is signaled by LIBXSMM_VLA.
*/
#if !defined(LIBXSMM_VLA) && !defined(LIBXSMM_NO_VLA) && !defined(__PGI) && ( \
(defined(__STDC_VERSION__) && (199901L/*C99*/ == __STDC_VERSION__ || (!defined(__STDC_NO_VLA__) && 199901L/*C99*/ < __STDC_VERSION__))) || \
(defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && !defined(__STRICT_ANSI__) && !defined(__cplusplus)) || \
(defined(LIBXSMM_INTEL_COMPILER) && !defined(_WIN32) && !defined(__cplusplus)) || \
(defined(__INTEL_COMPILER) && !defined(_WIN32)))
# define LIBXSMM_VLA
#endif
/**
* LIBXSMM_INDEX1 calculates the linear address for a given set of (multiple) indexes/bounds.
* Syntax: LIBXSMM_INDEX1(<ndims>, <i0>, ..., <i(ndims-1)>, <s1>, ..., <s(ndims-1)>).
* Please note that the leading dimension (s0) is omitted in the above syntax!
* TODO: support leading dimension (pitch/stride).
*/
#if defined(_MSC_VER) && !defined(__clang__) /* account for incorrect handling of __VA_ARGS__ */
# define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)LIBXSMM_EXPAND((__VA_ARGS__))
#else
# define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)(__VA_ARGS__)
#endif
#define LIBXSMM_INDEX1_1(...) ((size_t)LIBXSMM_SELECT_HEAD(__VA_ARGS__))
#define LIBXSMM_INDEX1_2(I0, I1, S1) (LIBXSMM_INDEX1_1(I0) * ((size_t)S1) + (size_t)I1)
#define LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) (LIBXSMM_INDEX1_2(I0, I1, S1) * ((size_t)S2) + (size_t)I2)
#define LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) (LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) * ((size_t)S3) + (size_t)I3)
#define LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) (LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) * ((size_t)S4) + (size_t)I4)
#define LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) (LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) * ((size_t)S5) + (size_t)I5)
#define LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) (LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) * ((size_t)S6) + (size_t)I6)
#define LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) (LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) * ((size_t)S7) + (size_t)I7)
#define LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) (LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) * ((size_t)S8) + (size_t)I8)
#define LIBXSMM_INDEX1_10(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, S9) (LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) * ((size_t)S9) + (size_t)I9)
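/* Worked example: for a 3-D array with bounds s0 x s1 x s2,
 * LIBXSMM_INDEX1(3, i0, i1, i2, s1, s2) evaluates to (i0 * s1 + i1) * s2 + i2. */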
/**
* LIBXSMM_VLA_DECL declares an array according to the given set of (multiple) bounds.
* Syntax: LIBXSMM_VLA_DECL(<ndims>, <elem-type>, <var-name>, <init>, <s1>, ..., <s(ndims-1)>).
 * The element type can be "const" or otherwise qualified; the initial value must be of type (const) element-type*.
* Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted!
*
* LIBXSMM_VLA_ACCESS gives the array element according to the given set of (multiple) indexes/bounds.
* Syntax: LIBXSMM_VLA_ACCESS(<ndims>, <array>, <i0>, ..., <i(ndims-1)>, <s1>, ..., <s(ndims-1)>).
* Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted!
*/
#if !defined(LIBXSMM_VLA_POSTFIX)
# define LIBXSMM_VLA_POSTFIX _
#endif
#if defined(LIBXSMM_VLA)
LIBXSMM_API_INLINE int libxsmm_nonconst_int(int i) { return i; }
# define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_VLA_ACCESS_ND(NDIMS, LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX), LIBXSMM_VLA_ACCESS_SINK, __VA_ARGS__)
# define LIBXSMM_VLA_ACCESS_SINK(S) + 0 * (S)
# define LIBXSMM_VLA_ACCESS_NONCONST(I) libxsmm_nonconst_int(I)
# define LIBXSMM_VLA_ACCESS_ND(NDIMS, ARRAY, XY, ...) LIBXSMM_CONCATENATE3(LIBXSMM_VLA_ACCESS_, NDIMS, D)(ARRAY, XY, __VA_ARGS__)
# define LIBXSMM_VLA_ACCESS_0D(ARRAY, XY, ...) (ARRAY)/*scalar*/
# define LIBXSMM_VLA_ACCESS_1D(ARRAY, XY, ...) ((ARRAY)[LIBXSMM_VLA_ACCESS_NONCONST(LIBXSMM_SELECT_HEAD(__VA_ARGS__))])
# define LIBXSMM_VLA_ACCESS_2D(ARRAY, XY, I0, I1, ...) (((ARRAY) XY(__VA_ARGS__))[I0][LIBXSMM_VLA_ACCESS_NONCONST(I1)])
# define LIBXSMM_VLA_ACCESS_3D(ARRAY, XY, I0, I1, I2, S1, ...) (((ARRAY) XY(S1) XY(__VA_ARGS__))[I0][I1][LIBXSMM_VLA_ACCESS_NONCONST(I2)])
# define LIBXSMM_VLA_ACCESS_4D(ARRAY, XY, I0, I1, I2, I3, S1, S2, ...) (((ARRAY) XY(S1) XY(S2) XY(__VA_ARGS__))[I0][I1][I2][LIBXSMM_VLA_ACCESS_NONCONST(I3)])
# define LIBXSMM_VLA_ACCESS_5D(ARRAY, XY, I0, I1, I2, I3, I4, S1, S2, S3, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(__VA_ARGS__))[I0][I1][I2][I3][LIBXSMM_VLA_ACCESS_NONCONST(I4)])
# define LIBXSMM_VLA_ACCESS_6D(ARRAY, XY, I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][LIBXSMM_VLA_ACCESS_NONCONST(I5)])
# define LIBXSMM_VLA_ACCESS_7D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][LIBXSMM_VLA_ACCESS_NONCONST(I6)])
# define LIBXSMM_VLA_ACCESS_8D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][LIBXSMM_VLA_ACCESS_NONCONST(I7)])
# define LIBXSMM_VLA_ACCESS_9D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][LIBXSMM_VLA_ACCESS_NONCONST(I8)])
# define LIBXSMM_VLA_ACCESS_10D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(S8) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][I8][LIBXSMM_VLA_ACCESS_NONCONST(I9)])
# define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \
ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX), \
LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/) = \
(ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *, \
LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/))LIBXSMM_SELECT_HEAD(__VA_ARGS__)
#else /* calculate linear index */
# define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX)[LIBXSMM_INDEX1(NDIMS, __VA_ARGS__)]
# define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \
ELEMENT_TYPE *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX) = /*(ELEMENT_TYPE*)*/LIBXSMM_SELECT_HEAD(__VA_ARGS__) \
+ 0 * LIBXSMM_INDEX1(NDIMS, LIBXSMM_SELECT_TAIL(__VA_ARGS__, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0))) /* dummy-shift to "sink" unused arguments */
#endif
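/* Usage sketch ("buf" and the bounds are hypothetical):
 * float buf[6] = { 0 }; // viewed as a 2 x 3 matrix
 * LIBXSMM_VLA_DECL(2, float, mat, buf, 3); // 2-D view; the leading bound (2) is omitted
 * LIBXSMM_VLA_ACCESS(2, mat, 1, 2, 3) = 1.f; // element [1][2], i.e., buf[1*3+2]
 */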
/** Access an array of TYPE with a byte-measured stride. */
#define LIBXSMM_ACCESS(TYPE, ARRAY, STRIDE) (*(TYPE*)((char*)(ARRAY) + (STRIDE)))
#if !defined(LIBXSMM_UNUSED)
# if 0
# define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE))
# else
# define LIBXSMM_UNUSED(VARIABLE) (void)(VARIABLE)
# endif
#endif
#if !defined(NDEBUG)
# define LIBXSMM_UNUSED_DEBUG(VARIABLE) LIBXSMM_UNUSED(VARIABLE)
#else
# define LIBXSMM_UNUSED_DEBUG(VARIABLE)
#endif
#if defined(_OPENMP)
# define LIBXSMM_PRAGMA_OMP(...) LIBXSMM_PRAGMA(omp __VA_ARGS__)
# if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
# define LIBXSMM_OMP_VAR(A) LIBXSMM_UNUSED(A) /* suppress warning about "unused" variable */
# elif defined(__clang__)
# define LIBXSMM_OMP_VAR(A) (A) = 0
# else
# define LIBXSMM_OMP_VAR(A)
# endif
#else
# define LIBXSMM_PRAGMA_OMP(...)
# define LIBXSMM_OMP_VAR(A)
#endif
#if defined(LIBXSMM_BUILD) && (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) && !defined(__MINGW32__)
# define LIBXSMM_ATTRIBUTE_WEAK_IMPORT LIBXSMM_ATTRIBUTE(weak_import)
# define LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE(weak)
#else
# define LIBXSMM_ATTRIBUTE_WEAK
# define LIBXSMM_ATTRIBUTE_WEAK_IMPORT
#endif
#if !defined(LIBXSMM_NO_CTOR) && !defined(LIBXSMM_CTOR) && \
(defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__)) && \
(defined(LIBXSMM_BUILD) && !defined(__STATIC)) && \
(defined(__GNUC__) || defined(__clang__))
# define LIBXSMM_ATTRIBUTE_CTOR LIBXSMM_ATTRIBUTE(constructor)
# define LIBXSMM_ATTRIBUTE_DTOR LIBXSMM_ATTRIBUTE(destructor)
# define LIBXSMM_CTOR
#else
# define LIBXSMM_ATTRIBUTE_CTOR
# define LIBXSMM_ATTRIBUTE_DTOR
#endif
#if defined(__GNUC__) && !defined(__PGI) && !defined(__ibmxl__)
# define LIBXSMM_ATTRIBUTE_NO_TRACE LIBXSMM_ATTRIBUTE(no_instrument_function)
#else
# define LIBXSMM_ATTRIBUTE_NO_TRACE
#endif
#if defined(__GNUC__)
# define LIBXSMM_MAY_ALIAS LIBXSMM_ATTRIBUTE(__may_alias__)
#else
# define LIBXSMM_MAY_ALIAS
#endif
#if !defined(LIBXSMM_MKTEMP_PATTERN)
# define LIBXSMM_MKTEMP_PATTERN "XXXXXX"
#endif
/** The group below fixes up some platform/compiler specifics. */
#if defined(_WIN32)
# if !defined(_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES)
# define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
# endif
# if !defined(_CRT_SECURE_NO_DEPRECATE)
# define _CRT_SECURE_NO_DEPRECATE 1
# endif
# if !defined(_USE_MATH_DEFINES)
# define _USE_MATH_DEFINES 1
# endif
# if !defined(WIN32_LEAN_AND_MEAN)
# define WIN32_LEAN_AND_MEAN 1
# endif
# if !defined(NOMINMAX)
# define NOMINMAX 1
# endif
# if defined(__INTEL_COMPILER) && (190023506 <= _MSC_FULL_VER)
# define __builtin_huge_val() HUGE_VAL
# define __builtin_huge_valf() HUGE_VALF
# define __builtin_nan nan
# define __builtin_nanf nanf
# define __builtin_nans nan
# define __builtin_nansf nanf
# if defined(__cplusplus)
# define _CMATH_
# endif
# endif
#endif
#if !defined(_GNU_SOURCE) && defined(LIBXSMM_BUILD)
# define _GNU_SOURCE
#endif
#if !defined(__STDC_FORMAT_MACROS)
# define __STDC_FORMAT_MACROS
#endif
#if defined(__clang__) && !defined(__extern_always_inline)
# define __extern_always_inline LIBXSMM_INLINE
#endif
#if defined(LIBXSMM_INLINE_FIXUP) && !defined(inline)
# define inline LIBXSMM_INLINE_KEYWORD
#endif
#if (0 != LIBXSMM_SYNC)
# if !defined(_REENTRANT)
# define _REENTRANT
# endif
# if defined(__PGI)
# if defined(__GCC_ATOMIC_TEST_AND_SET_TRUEVAL)
# undef __GCC_ATOMIC_TEST_AND_SET_TRUEVAL
# endif
# define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
# endif
#endif
#if !defined(__has_feature) && !defined(__clang__)
# define __has_feature(A) 0
#endif
#if !defined(__has_builtin) && !defined(__clang__)
# define __has_builtin(A) 0
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
#if (0 != LIBXSMM_SYNC)
# if defined(_WIN32) || defined(__CYGWIN__)
# include <windows.h>
# else
# include <pthread.h>
# endif
#endif
#if !defined(LIBXSMM_ASSERT)
# include <assert.h>
# if defined(NDEBUG)
# define LIBXSMM_ASSERT(EXPR) LIBXSMM_ASSUME(EXPR)
# else
# define LIBXSMM_ASSERT(EXPR) assert(EXPR)
# endif
#endif
#if !defined(LIBXSMM_ASSERT_MSG)
# define LIBXSMM_ASSERT_MSG(EXPR, MSG) assert((EXPR) && *MSG)
#endif
#if !defined(LIBXSMM_EXPECT_ELIDE)
# define LIBXSMM_EXPECT_ELIDE(RESULT, EXPR) do { \
/*const*/ int libxsmm_expect_result_ = ((RESULT) == (EXPR)); \
LIBXSMM_UNUSED(libxsmm_expect_result_); \
} while(0)
#endif
#if defined(NDEBUG)
# define LIBXSMM_EXPECT LIBXSMM_EXPECT_ELIDE
# define LIBXSMM_EXPECT_NOT LIBXSMM_EXPECT_ELIDE
#else
# define LIBXSMM_EXPECT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) == (EXPR))
# define LIBXSMM_EXPECT_NOT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) != (EXPR))
#endif
#if defined(_DEBUG)
# define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT
#else
# define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT_ELIDE
#endif
#if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)
# include <omp.h>
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <float.h>
#include <stdio.h>
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
#if !defined(FLT_MAX)
# if !defined(__FLT_MAX__)
# define FLT_MAX 3.40282346638528859811704183484516925e+38F
# else
# define FLT_MAX __FLT_MAX__
# endif
#endif
#if !defined(FLT_MIN)
# if !defined(__FLT_MIN__)
# define FLT_MIN 1.17549435082228750796873653722224568e-38F
# else
# define FLT_MIN __FLT_MIN__
# endif
#endif
#if defined(_WIN32) && 0
# define LIBXSMM_SNPRINTF(S, N, ...) _snprintf_s(S, N, _TRUNCATE, __VA_ARGS__)
#elif defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__ || defined(__GNUC__))
# define LIBXSMM_SNPRINTF(S, N, ...) snprintf(S, N, __VA_ARGS__)
#else
# define LIBXSMM_SNPRINTF(S, N, ...) sprintf((S) + /*unused*/(N) * 0, __VA_ARGS__)
#endif
#if defined(__THROW) && defined(__cplusplus)
# define LIBXSMM_THROW __THROW
#endif
#if !defined(LIBXSMM_THROW)
# define LIBXSMM_THROW
#endif
#if defined(__GNUC__) && LIBXSMM_VERSION2(4, 2) == LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && \
!defined(__clang__) && !defined(__PGI) && !defined(__INTEL_COMPILER) && !defined(_CRAYC)
# define LIBXSMM_NOTHROW LIBXSMM_THROW
#else
# define LIBXSMM_NOTHROW
#endif
#if defined(__cplusplus)
# if (__cplusplus > 199711L)
# define LIBXSMM_NOEXCEPT noexcept
# else
# define LIBXSMM_NOEXCEPT throw()
# endif
#else
# define LIBXSMM_NOEXCEPT LIBXSMM_NOTHROW
#endif
#if defined(_WIN32)
# define LIBXSMM_PUTENV(A) _putenv(A)
#else
# define LIBXSMM_PUTENV(A) putenv(A)
#endif
/* block must be after including above header files */
#if (defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__) < LIBXSMM_VERSION2(2, 26)) \
|| (defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER) && !defined(__cplusplus) && defined(__linux__))
/* _Float128 was introduced with GNU GCC 7.0. */
# if !defined(_Float128) && !defined(__SIZEOF_FLOAT128__) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__)
# define _Float128 __float128
# endif
# if !defined(LIBXSMM_GLIBC_FPTYPES) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__) \
&& (LIBXSMM_VERSION2(7, 0) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) || \
(defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER)))
# define LIBXSMM_GLIBC_FPTYPES
# endif
# if !defined(_Float128X) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float128X _Float128
# endif
# if !defined(_Float32) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float32 float
# endif
# if !defined(_Float32x) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float32x _Float32
# endif
# if !defined(_Float64) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float64 double
# endif
# if !defined(_Float64x) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float64x _Float64
# endif
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
#if defined(LIBXSMM_GLIBC_FPTYPES)
# if defined(__cplusplus)
# undef __USE_MISC
# if !defined(_DEFAULT_SOURCE)
# define _DEFAULT_SOURCE
# endif
# if !defined(_BSD_SOURCE)
# define _BSD_SOURCE
# endif
# else
# if !defined(__PURE_INTEL_C99_HEADERS__)
# define __PURE_INTEL_C99_HEADERS__
# endif
# endif
#endif
#if !defined(LIBXSMM_NO_LIBM)
# if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= LIBXSMM_INTEL_COMPILER)) \
&& !defined(_WIN32) /* error including dfp754.h */
# include <mathimf.h>
# endif
# include <math.h>
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
#endif /*LIBXSMM_MACROS_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MALLOC_H
#define LIBXSMM_MALLOC_H
#include "libxsmm_memory.h"
/* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */
#if !defined(LIBXSMM_TF12) && (!defined(TF_VERSION_STRING) || \
LIBXSMM_VERSION2(1, 12) <= LIBXSMM_VERSION2(TF_MAJOR_VERSION, TF_MINOR_VERSION))
# define LIBXSMM_TF12 /* TF_PATCH_VERSION does not matter */
#endif
/** Can be used with libxsmm_[get|set]_scratch_limit. */
#define LIBXSMM_SCRATCH_UNLIMITED ((size_t)LIBXSMM_UNLIMITED)
#define LIBXSMM_SCRATCH_DEFAULT 0
/** Function types accepted for memory allocation (see libxsmm_*_allocator). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_ctx)(size_t /*size*/, const void* /*context*/);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_fun)(size_t /*size*/);
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_malloc_function {
libxsmm_malloc_ctx ctx_form;
libxsmm_malloc_fun function;
} libxsmm_malloc_function;
/** Function types accepted for releasing memory (see libxsmm_*_allocator). */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_ctx)(void* /*buffer*/, const void* /*context*/);
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_fun)(void* /*buffer*/);
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_free_function {
libxsmm_free_ctx ctx_form;
libxsmm_free_fun function;
} libxsmm_free_function;
/**
 * To setup a custom default memory allocator, both a malloc_fn and a free_fn
 * are given; two NULL-pointers reset the default allocator to the
 * library-internal default. If a context is given (non-NULL), the context-based
 * form of the memory allocation is used.
 * Changing the allocator (including the function for deallocation) applies to
 * upcoming allocations/deallocations and works correctly for pending buffers.
 */
LIBXSMM_API int libxsmm_set_default_allocator(/* malloc_fn/free_fn must correspond */
const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn);
/** Retrieve the default memory allocator. */
LIBXSMM_API int libxsmm_get_default_allocator(const void** context,
libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn);
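/* Minimal usage sketch (illustrative only, not part of the API surface):
 * install a plain-form default allocator backed by malloc/free, and reset
 * it afterwards by passing two NULL-pointers as documented above.
 *   libxsmm_malloc_function malloc_fn; libxsmm_free_function free_fn;
 *   malloc_fn.function = malloc; free_fn.function = free;
 *   libxsmm_set_default_allocator(NULL, malloc_fn, free_fn);
 *   ...
 *   malloc_fn.function = NULL; free_fn.function = NULL;
 *   libxsmm_set_default_allocator(NULL, malloc_fn, free_fn);
 */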
/**
 * To setup the scratch memory allocator, a malloc_fn function and an optional free_fn
 * are given. A NULL free_fn acts as a no-operation, and the deallocation is expected
 * to be controlled otherwise. If two NULL-pointers are given, the allocator is reset
 * to the currently active default memory allocator. If a context is given (non-NULL),
 * the context-based form of the memory allocation is used.
 * Changing the allocator (including the function for deallocation) applies to
 * upcoming allocations/deallocations and works correctly for pending buffers.
 */
LIBXSMM_API int libxsmm_set_scratch_allocator(/* malloc_fn/free_fn must correspond */
const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn);
/** Retrieve the scratch memory allocator. */
LIBXSMM_API int libxsmm_get_scratch_allocator(const void** context,
libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn);
/** Allocate memory (malloc/free interface). */
LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_malloc(size_t size);
/** Allocate aligned memory using the default allocator. */
LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_aligned_malloc(size_t size,
/**
* =0: align automatically according to the size
   * >0: align according to the given alignment value
*/
size_t alignment);
/** Reallocate memory using the default allocator (alignment is preserved). */
LIBXSMM_API void* libxsmm_realloc(size_t size, void* ptr);
/**
* Allocate aligned scratch memory. It is not supported
* to query properties per libxsmm_get_malloc_info, but
 * libxsmm_get_scratch_info can be used instead.
*/
LIBXSMM_API void* libxsmm_scratch_malloc(size_t size,
/**
* =0: align automatically according to the size
   * >0: align according to the given alignment value
*/
size_t alignment,
/**
* Identifies the call site, which is used
* to determine the memory pool.
*/
const void* caller);
/**
* Binary form of libxsmm_scratch_malloc, which
* expands the call-context automatically. This
* macro is intentionally lower case.
*/
#define libxsmm_aligned_scratch(size, alignment) \
libxsmm_scratch_malloc(size, alignment, \
LIBXSMM_CALLER_ID)
/** Deallocate memory (malloc/free interface). */
LIBXSMM_API void libxsmm_free(const void* memory);
/**
* Release the entire scratch memory regardless
* of whether it is still referenced or not.
*/
LIBXSMM_API void libxsmm_release_scratch(void);
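/* Minimal usage sketch (illustrative only): aligned and scratch allocations
 * are both released with libxsmm_free; sizes/alignments below are arbitrary.
 *   void* p = libxsmm_aligned_malloc(1024, 0);    0: automatic alignment
 *   void* s = libxsmm_aligned_scratch(4096, 64);  64-Byte alignment
 *   ...
 *   libxsmm_free(p);
 *   libxsmm_free(s);
 *   libxsmm_release_scratch();  optionally drop all scratch pools
 */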
/** Information about a buffer (default memory domain). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_malloc_info {
/** Size of the buffer. */
size_t size;
} libxsmm_malloc_info;
/** Retrieve information about a buffer (default memory domain). */
LIBXSMM_API int libxsmm_get_malloc_info(const void* memory, libxsmm_malloc_info* info);
/** Information about the scratch memory domain. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_scratch_info {
/** Watermark memory across pools (size), unsatisfied (local), and library-internal memory. */
size_t size, local, internal;
/** Pending allocations (not released). */
size_t npending;
/** Number of allocations so far. */
size_t nmallocs;
/** Number of pools used. */
unsigned int npools;
} libxsmm_scratch_info;
/** Retrieve information about the scratch memory domain. */
LIBXSMM_API int libxsmm_get_scratch_info(libxsmm_scratch_info* info);
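/* Usage sketch (illustrative only; assumes the usual EXIT_SUCCESS convention):
 *   libxsmm_scratch_info info;
 *   if (EXIT_SUCCESS == libxsmm_get_scratch_info(&info)) {
 *     printf("scratch: %llu Bytes in %u pool(s)\n",
 *       (unsigned long long)info.size, info.npools);
 *   }
 */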
/**
* Limit the total size (Bytes) of the scratch memory.
* LIBXSMM_SCRATCH_UNLIMITED removes any limit, and
* LIBXSMM_SCRATCH_DEFAULT populates the default.
* The related environment variable LIBXSMM_SCRATCH_LIMIT
* allows units: <none>/b/B (Bytes), k/K, m/M, and g/G.
*/
LIBXSMM_API void libxsmm_set_scratch_limit(size_t nbytes);
/** Get the maximum size of the scratch memory domain. */
LIBXSMM_API size_t libxsmm_get_scratch_limit(void);
/**
 * Intercepts malloc/free to use the scratch memory allocator
 * (related environment variable: LIBXSMM_MALLOC).
* Optionally set the range of malloc-sizes to be intercepted.
* The related environment variable LIBXSMM_MALLOC_LIMIT
* allows units: <none>/b/B (Bytes), k/K, m/M, and g/G.
*/
LIBXSMM_API void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi);
/**
* Determines if malloc/free are (and can be) intercepted.
* Optionally gets the range of enabled malloc-sizes.
*/
LIBXSMM_API int libxsmm_get_malloc(size_t* lo, size_t* hi);
/**
* Calculate the linear offset of the n-dimensional (ndims) offset (can be NULL),
* and the (optional) linear size of the corresponding shape.
*/
LIBXSMM_API size_t libxsmm_offset(const size_t offset[], const size_t shape[], size_t ndims, size_t* size);
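/* Usage sketch (illustrative only; offset and size are in elements): linear
 * offset of index (2,1,3) in a dense array of shape 4 x 5 x 6, and the total
 * number of elements of that shape (nelems = 120).
 *   const size_t idx[] = { 2, 1, 3 }, shape[] = { 4, 5, 6 };
 *   size_t nelems = 0;
 *   const size_t off = libxsmm_offset(idx, shape, 3, &nelems);
 */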
#if defined(__cplusplus)
/** RAII idiom to temporarily setup an allocator for the lifetime of the scope. */
template<typename kind> class LIBXSMM_RETARGETABLE libxsmm_scoped_allocator {
public:
/** C'tor, which instantiates the new allocator (plain form). */
libxsmm_scoped_allocator(libxsmm_malloc_fun malloc_fn, libxsmm_free_fun free_fn) {
kind::get(m_context, m_malloc, m_free);
kind::set(NULL/*context*/, NULL/*malloc_ctx*/, NULL/*free_ctx*/, malloc_fn, free_fn);
}
/** C'tor, which instantiates the new allocator (context form). */
libxsmm_scoped_allocator(const void* context, libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx,
libxsmm_malloc_fun malloc_fun = NULL, libxsmm_free_fun free_fun = NULL)
{
kind::get(m_context, m_malloc, m_free);
kind::set(context, malloc_ctx, free_ctx, malloc_fun, free_fun);
}
/** Following the RAII idiom, the d'tor restores the previous allocator. */
~libxsmm_scoped_allocator() {
kind::set(m_context,
m_malloc.ctx_form, m_free.ctx_form,
m_malloc.function, m_free.function);
}
private: /* no copy/assignment */
explicit libxsmm_scoped_allocator(const libxsmm_scoped_allocator&);
libxsmm_scoped_allocator& operator=(const libxsmm_scoped_allocator&);
protected: /* saved/previous allocator */
const void* m_context;
libxsmm_malloc_function m_malloc;
libxsmm_free_function m_free;
};
/** Allocator-kind to instantiate libxsmm_scoped_allocator<kind>. */
struct LIBXSMM_RETARGETABLE libxsmm_default_allocator {
static void set(const void* context,
libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx,
libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun)
{
libxsmm_malloc_function malloc_fn;
libxsmm_free_function free_fn;
if (NULL == context) { /* use global form only when no context is given */
malloc_fn.function = malloc_fun; free_fn.function = free_fun;
}
else {
malloc_fn.ctx_form = malloc_ctx; free_fn.ctx_form = free_ctx;
}
libxsmm_set_default_allocator(context, malloc_fn, free_fn);
}
static void get(const void*& context,
libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn)
{
libxsmm_get_default_allocator(&context, &malloc_fn, &free_fn);
}
};
/** Allocator-kind to instantiate libxsmm_scoped_allocator<kind>. */
struct LIBXSMM_RETARGETABLE libxsmm_scratch_allocator {
static void set(const void* context,
libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx,
libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun)
{
libxsmm_malloc_function malloc_fn;
libxsmm_free_function free_fn;
if (NULL != context) { /* adopt context form */
malloc_fn.function = malloc_fun; free_fn.function = free_fun;
}
else { /* adopt global form */
malloc_fn.ctx_form = malloc_ctx; free_fn.ctx_form = free_ctx;
}
libxsmm_set_scratch_allocator(context, malloc_fn, free_fn);
}
static void get(const void*& context,
libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn)
{
libxsmm_get_scratch_allocator(&context, &malloc_fn, &free_fn);
}
};
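/* Usage sketch (C++, illustrative only): my_malloc/my_free are hypothetical
 * user-defined functions; the previous allocator is restored at scope-exit.
 *   {
 *     libxsmm_scoped_allocator<libxsmm_scratch_allocator> use(my_malloc, my_free);
 *     ...  scratch allocations within this scope use my_malloc/my_free
 *   }
 */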
/** Forward-declared types/functions used to implement libxsmm_tf_allocator. */
namespace tensorflow {
class Allocator;
#if defined(LIBXSMM_TF12)
class DeviceBase; int DeviceNumaNode(const DeviceBase* /*device*/);
Allocator* cpu_allocator(int /*numa_node*/);
#else
Allocator* cpu_allocator();
#endif
}
/**
* An object of this type adopts a memory allocator from TensorFlow.
* All memory allocations of the requested kind within the current
* scope (where the libxsmm_tf_allocator object lives) are subject
* to TensorFlow's memory allocation scheme. The allocation kind
* is usually "libxsmm_scratch_allocator"; using a second object
* of kind "libxsmm_default_allocator" makes the default memory
* allocation of LIBXSMM subject to TensorFlow as well.
*/
template<typename kind> class LIBXSMM_RETARGETABLE libxsmm_tf_allocator:
public libxsmm_scoped_allocator<kind>
{
public:
/** The TensorFlow allocator is adopted from the global CPU memory allocator. */
explicit libxsmm_tf_allocator()
: libxsmm_scoped_allocator<kind>(
libxsmm_tf_allocator::malloc,
libxsmm_tf_allocator::free)
{}
/** The TensorFlow allocator is adopted from the given OpKernelContext. */
template<typename context_type>
explicit libxsmm_tf_allocator(context_type& context)
: libxsmm_scoped_allocator<kind>(&context,
libxsmm_tf_allocator::template malloc_ctx<context_type>,
libxsmm_tf_allocator::template free_ctx<context_type>,
libxsmm_tf_allocator::malloc,
libxsmm_tf_allocator::free)
{}
/** Global form of allocating memory (malloc signature). */
static void* malloc(size_t size) {
#if defined(LIBXSMM_TF12)
return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), size);
#else
return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(), size);
#endif
}
/** Global form of deallocating memory (free signature). */
static void free(void* buffer) {
#if defined(LIBXSMM_TF12)
libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), buffer);
#else
libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(), buffer);
#endif
}
/** Context based form of allocating memory. */
template<typename context_type> static void* malloc_ctx(const void* context, size_t size) {
typedef typename context_type::WrappedAllocator::first_type allocator_ptr;
    context_type *const tf_context = static_cast<context_type*>(const_cast<void*>(context));
allocator_ptr allocator = NULL;
if (NULL != tf_context) {
#if !defined(LIBXSMM_TF12)
if (NULL != tf_context->device()) {
if (0 < tf_context->num_outputs()) {
allocator = tf_context->device()->GetStepAllocator(
tf_context->output_alloc_attr(0),
tf_context->resource_manager());
}
else if (0 < tf_context->num_inputs()) {
allocator = tf_context->device()->GetStepAllocator(
tf_context->input_alloc_attr(0),
tf_context->resource_manager());
}
}
#else /* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */
const int numa_node = DeviceNumaNode(tf_context->device());
allocator = tensorflow::cpu_allocator(numa_node);
#endif
}
return libxsmm_tf_allocator::allocate(allocator, size);
}
/** Context based form of deallocating memory. */
template<typename context_type> static void free_ctx(const void* context, void* buffer) {
typedef typename context_type::WrappedAllocator::first_type allocator_ptr;
    context_type *const tf_context = static_cast<context_type*>(const_cast<void*>(context));
allocator_ptr allocator = NULL;
if (NULL != tf_context) {
#if defined(LIBXSMM_TF12)
const int numa_node = DeviceNumaNode(tf_context->device());
allocator = tensorflow::cpu_allocator(numa_node);
#else
if (NULL != tf_context->device()) {
if (0 < tf_context->num_outputs()) {
allocator = tf_context->device()->GetStepAllocator(
tf_context->output_alloc_attr(0),
tf_context->resource_manager());
}
else if (0 < tf_context->num_inputs()) {
allocator = tf_context->device()->GetStepAllocator(
tf_context->input_alloc_attr(0),
tf_context->resource_manager());
}
}
#endif
}
libxsmm_tf_allocator::deallocate(allocator, buffer);
}
private:
template<typename allocator_ptr> /* break interface dependency with TF */
static void* allocate(allocator_ptr allocator, size_t size) {
void* result;
if (NULL != allocator) {
      /* no (useless) waste with alignment; raw result is re-aligned anyway */
result = allocator->AllocateRaw(1/*alignment*/, size);
}
else {
LIBXSMM_ASSERT_MSG(0/*false*/, "LIBXSMM ERROR: memory allocator is missing");
result = NULL;
}
return result;
}
template<typename allocator_ptr> /* break interface dependency with TF */
static void deallocate(allocator_ptr allocator, void* buffer) {
LIBXSMM_ASSERT_MSG(NULL != allocator, "LIBXSMM ERROR: memory allocator is missing");
if (NULL != allocator) allocator->DeallocateRaw(buffer);
}
};
#endif /*defined(__cplusplus)*/
#endif /*LIBXSMM_MALLOC_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MATH_H
#define LIBXSMM_MATH_H
#include "libxsmm_typedefs.h"
/**
* Structure of differences with matrix norms according
 * to http://www.netlib.org/lapack/lug/node75.html.
*/
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_matdiff_info {
/** One-norm */ double norm1_abs, norm1_rel;
/** Infinity-norm */ double normi_abs, normi_rel;
  /** Frobenius-norm */ double normf_rel;
/** Maximum difference, L2-norm (absolute and relative), and R-squared. */
double linf_abs, linf_rel, l2_abs, l2_rel, rsq;
/** Statistics: sum/l1, min., max., arith. avg., and variance. */
double l1_ref, min_ref, max_ref, avg_ref, var_ref;
/** Statistics: sum/l1, min., max., arith. avg., and variance. */
double l1_tst, min_tst, max_tst, avg_tst, var_tst;
/** Values (v_ref, v_tst) and location (m, n) of largest linf_abs. */
double v_ref, v_tst;
libxsmm_blasint m, n;
} libxsmm_matdiff_info;
/**
* Utility function to calculate a collection of scalar differences between two matrices (libxsmm_matdiff_info).
* The location (m, n) of the largest difference (linf_abs) is recorded (also in case of NaN). In case of NaN,
* differences are set to infinity. If no difference is discovered, the location (m, n) is negative (OOB).
*/
LIBXSMM_API int libxsmm_matdiff(libxsmm_matdiff_info* info,
libxsmm_datatype datatype, libxsmm_blasint m, libxsmm_blasint n, const void* ref, const void* tst,
const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst);
/**
* Reduces input into output such that the difference is maintained or increased (max function).
* The very first (initial) output should be zeroed (libxsmm_matdiff_clear).
*/
LIBXSMM_API void libxsmm_matdiff_reduce(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input);
/** Clears the given info-structure, e.g., for the initial reduction-value (libxsmm_matdiff_reduce). */
LIBXSMM_API void libxsmm_matdiff_clear(libxsmm_matdiff_info* info);
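/* Usage sketch (illustrative only; assumes the usual EXIT_SUCCESS convention,
 * and that NULL leading dimensions default to tight/minimal strides):
 * compare an m x n test matrix against a reference matrix (both FP64).
 *   libxsmm_matdiff_info diff;
 *   libxsmm_matdiff_clear(&diff);
 *   if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F64,
 *     m, n, ref, tst, NULL, NULL))
 *   {
 *     printf("linf_abs=%g normf_rel=%g\n", diff.linf_abs, diff.normf_rel);
 *   }
 */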
/** Greatest common divisor (corner case: the GCD of 0 and 0 is 1). */
LIBXSMM_API size_t libxsmm_gcd(size_t a, size_t b);
/** Least common multiple. */
LIBXSMM_API size_t libxsmm_lcm(size_t a, size_t b);
/**
* This function finds prime-factors (up to 32) of an unsigned integer in ascending order, and
* returns the number of factors found (zero if the given number is prime and unequal to two).
*/
LIBXSMM_API int libxsmm_primes_u32(unsigned int num, unsigned int num_factors_n32[]);
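/* Usage sketch (illustrative only): factorizing 360 = 2*2*2*3*3*5 should
 * yield n = 6 with factors[0..5] = {2, 2, 2, 3, 3, 5} (ascending order).
 *   unsigned int factors[32];
 *   const int n = libxsmm_primes_u32(360, factors);
 */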
/** Calculate a co-prime number <= n/2 (except: libxsmm_shuffle(0|1) == 0). */
LIBXSMM_API size_t libxsmm_shuffle(unsigned int n);
/**
* Divides the product into prime factors and selects factors such that the new product is within
* the given limit (0/1-Knapsack problem), e.g., product=12=2*2*3 and limit=6 then result=2*3=6.
* The limit is at least reached or exceeded with the minimal possible product (is_lower=true).
*/
LIBXSMM_API unsigned int libxsmm_product_limit(unsigned int product, unsigned int limit, int is_lower);
/** Kahan's summation: returns "accumulator += value" and updates the compensation. */
LIBXSMM_API double libxsmm_kahan_sum(double value, double* accumulator, double* compensation);
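/* Usage sketch (illustrative only): compensated summation over an array a[0..n-1].
 *   double sum = 0.0, comp = 0.0; int i;
 *   for (i = 0; i < n; ++i) libxsmm_kahan_sum(a[i], &sum, &comp);
 *   sum now carries the compensated total
 */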
/** SQRT with Newton's method using integer arithmetic. */
LIBXSMM_API unsigned int libxsmm_isqrt_u64(unsigned long long x);
/** SQRT with Newton's method using integer arithmetic. */
LIBXSMM_API unsigned int libxsmm_isqrt_u32(unsigned int x);
/** Based on libxsmm_isqrt_u32, but returns an actual factor of x. */
LIBXSMM_API unsigned int libxsmm_isqrt2_u32(unsigned int x);
/** SQRT with Newton's method using double-precision. */
LIBXSMM_API double libxsmm_dsqrt(double x);
/** SQRT with Newton's method using single-precision. */
LIBXSMM_API float libxsmm_ssqrt(float x);
/** CBRT with Newton's method using integer arithmetic. */
LIBXSMM_API unsigned int libxsmm_icbrt_u64(unsigned long long x);
/** CBRT with Newton's method using integer arithmetic. */
LIBXSMM_API unsigned int libxsmm_icbrt_u32(unsigned int x);
/** Single-precision approximation of exponential function (base 2). */
LIBXSMM_API float libxsmm_sexp2(float x);
/**
* Exponential function (base 2), which is limited to unsigned 8-bit input values.
* This function reproduces bit-accurate results (single-precision).
*/
LIBXSMM_API float libxsmm_sexp2_u8(unsigned char x);
/**
* Exponential function (base 2), which is limited to signed 8-bit input values.
* This function reproduces bit-accurate results (single-precision).
*/
LIBXSMM_API float libxsmm_sexp2_i8(signed char x);
/** Similar to libxsmm_sexp2_i8, but takes an integer as signed 8-bit value (check). */
LIBXSMM_API float libxsmm_sexp2_i8i(int x);
/** Inlineable fast tanh (Pade 7/8 approximation), such that the compiler can potentially vectorize. */
LIBXSMM_API_INLINE float libxsmm_stanh_pade78(float i_x) {
const float l_c0 = 2027025.0f;
const float l_c1 = 270270.0f;
const float l_c2 = 6930.0f;
const float l_c3 = 36.0f;
const float l_c1_d = 945945.0f;
const float l_c2_d = 51975.0f;
const float l_c3_d = 630.0f;
const float l_hi_bound = 4.97f;
const float l_lo_bound = -4.97f;
const float l_ones = 1.0f;
const float l_neg_ones = -1.0f;
const float x2 = i_x * i_x;
const float t1_nom = (l_c3 * x2) + l_c2;
const float t2_nom = (t1_nom * x2) + l_c1;
const float t3_nom = (t2_nom * x2) + l_c0;
const float nom = t3_nom * i_x;
const float t1_denom = x2 + l_c3_d;
const float t2_denom = (t1_denom * x2) + l_c2_d;
const float t3_denom = (t2_denom * x2) + l_c1_d;
const float denom = (t3_denom * x2) + l_c0;
  float result = nom / denom;
  /* saturate to +/-1 when the input is outside of the approximation's valid range */
  result = (i_x > l_hi_bound) ? l_ones : result;
  result = (i_x < l_lo_bound) ? l_neg_ones : result;
return result;
}
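/* Usage sketch (illustrative only): tanh-activation over a buffer; the loop
 * body is branch-free and hence amenable to compiler auto-vectorization.
 *   for (i = 0; i < n; ++i) out[i] = libxsmm_stanh_pade78(in[i]);
 */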
#endif /*LIBXSMM_MATH_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MEMORY_H
#define LIBXSMM_MEMORY_H
#include "libxsmm_macros.h"
#if defined(__clang_analyzer__)
# define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) memset((void*)(PTRDST), VALUE, SIZE)
#else
# define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) { \
char *const libxsmm_memset127_dst_ = (char*)(PTRDST); \
union { size_t size; signed char size1; } libxsmm_memset127_; \
signed char libxsmm_memset127_i_; LIBXSMM_ASSERT((SIZE) <= 127); \
libxsmm_memset127_.size = (SIZE); \
LIBXSMM_PRAGMA_UNROLL \
for (libxsmm_memset127_i_ = 0; libxsmm_memset127_i_ < libxsmm_memset127_.size1; \
++libxsmm_memset127_i_) \
{ \
libxsmm_memset127_dst_[libxsmm_memset127_i_] = (char)(VALUE); \
} \
}
#endif
#define LIBXSMM_MEMZERO127(PTRDST) LIBXSMM_MEMSET127(PTRDST, '\0', sizeof(*(PTRDST)))
#define LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, NTS) { \
const unsigned char *const libxsmm_memcpy127_loop_src_ = (const unsigned char*)(PTRSRC); \
unsigned char *const libxsmm_memcpy127_loop_dst_ = (unsigned char*)(PTRDST); \
signed char libxsmm_memcpy127_loop_i_; LIBXSMM_ASSERT((SIZE) <= 127); \
NTS(libxsmm_memcpy127_loop_dst_) LIBXSMM_PRAGMA_UNROLL \
for (libxsmm_memcpy127_loop_i_ = 0; libxsmm_memcpy127_loop_i_ < (signed char)(SIZE); \
++libxsmm_memcpy127_loop_i_) \
{ \
libxsmm_memcpy127_loop_dst_[libxsmm_memcpy127_loop_i_] = \
libxsmm_memcpy127_loop_src_[libxsmm_memcpy127_loop_i_]; \
} \
}
#define LIBXSMM_MEMCPY127_NTS(...)
#define LIBXSMM_MEMCPY127(PTRDST, PTRSRC, SIZE) \
LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, LIBXSMM_MEMCPY127_NTS)
#define LIBXSMM_ASSIGN127(PTRDST, PTRSRC) LIBXSMM_ASSERT(sizeof(*(PTRSRC)) <= sizeof(*(PTRDST))); \
LIBXSMM_MEMCPY127(PTRDST, PTRSRC, sizeof(*(PTRSRC)))
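/* Usage sketch (illustrative only): "item" is a hypothetical small POD type
 * (at most 127 Bytes, as asserted by the macros above).
 *   struct item { int key; double value[8]; } a, b;
 *   LIBXSMM_MEMZERO127(&a);      zero-fill a
 *   LIBXSMM_ASSIGN127(&b, &a);   copy a into b
 */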
/**
* Calculates if there is a difference between two (short) buffers.
* Returns zero if there is no difference; otherwise non-zero.
*/
LIBXSMM_API unsigned char libxsmm_diff(const void* a, const void* b, unsigned char size);
/**
* Calculates if there is a difference between "a" and "n x b".
* Returns the index of the first match (or "n" in case of no match).
*/
LIBXSMM_API unsigned int libxsmm_diff_n(const void* a, const void* bn, unsigned char size,
unsigned char stride, unsigned int hint, unsigned int n);
/** Similar to memcmp (C standard library), but the result is conceptually only a boolean. */
LIBXSMM_API int libxsmm_memcmp(const void* a, const void* b, size_t size);
/** Calculate a hash value for the given buffer and seed; accepts NULL-buffer. */
LIBXSMM_API unsigned int libxsmm_hash(const void* data, unsigned int size, unsigned int seed);
/** Calculate a 64-bit hash for the given character string; accepts NULL-string. */
LIBXSMM_API unsigned long long libxsmm_hash_string(const char* string);
/** Return the pointer to the 1st match of "b" in "a", or NULL (no match). */
LIBXSMM_API const char* libxsmm_stristr(const char* a, const char* b);
/**
* Check if pointer is SIMD-aligned and optionally consider the next access (increment in Bytes).
* Optionally calculates the alignment of the given pointer in Bytes.
*/
LIBXSMM_API int libxsmm_aligned(const void* ptr, const size_t* inc, int* alignment);
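/* Usage sketch (illustrative only; the seed value is arbitrary):
 *   const char text[] = "LIBXSMM";
 *   const unsigned int h = libxsmm_hash(text, sizeof(text), 25071975);
 *   const int equal = (0 == libxsmm_memcmp(text, text, sizeof(text)));
 *   int alignment = 0;
 *   libxsmm_aligned(text, NULL, &alignment);  alignment of text in Bytes
 */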
#endif /*LIBXSMM_MEMORY_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MHD_H
#define LIBXSMM_MHD_H
#include "libxsmm_typedefs.h"
/** Denotes the element/pixel type of an image/channel. */
typedef enum libxsmm_mhd_elemtype {
LIBXSMM_MHD_ELEMTYPE_F64 = LIBXSMM_DATATYPE_F64, /* MET_DOUBLE */
LIBXSMM_MHD_ELEMTYPE_F32 = LIBXSMM_DATATYPE_F32, /* MET_FLOAT */
LIBXSMM_MHD_ELEMTYPE_BF16 = LIBXSMM_DATATYPE_BF16, /* MET_BFLOAT */
LIBXSMM_MHD_ELEMTYPE_I64 = LIBXSMM_DATATYPE_I64, /* MET_LONG */
LIBXSMM_MHD_ELEMTYPE_I32 = LIBXSMM_DATATYPE_I32, /* MET_INT */
LIBXSMM_MHD_ELEMTYPE_I16 = LIBXSMM_DATATYPE_I16, /* MET_SHORT */
LIBXSMM_MHD_ELEMTYPE_I8 = LIBXSMM_DATATYPE_I8, /* MET_CHAR */
LIBXSMM_MHD_ELEMTYPE_U64 = LIBXSMM_DATATYPE_UNSUPPORTED, /* MET_ULONG */
LIBXSMM_MHD_ELEMTYPE_U32, /* MET_UINT */
LIBXSMM_MHD_ELEMTYPE_U16, /* MET_USHORT */
LIBXSMM_MHD_ELEMTYPE_U8, /* MET_UCHAR */
LIBXSMM_MHD_ELEMTYPE_UNKNOWN
} libxsmm_mhd_elemtype;
/**
* Function type used for custom data-handler or element conversion.
* The value-range (src_min, src_max) may be used to scale values
* in case of a type-conversion.
*/
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE int (*libxsmm_mhd_element_handler)(
void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type,
const void* src, const void* src_min, const void* src_max);
/**
* Predefined function to perform element data conversion.
* Scales source-values in case of non-NULL src_min and src_max,
* or otherwise clamps to the destination-type.
*/
LIBXSMM_API int libxsmm_mhd_element_conversion(
void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type,
const void* src, const void* src_min, const void* src_max);
/**
* Predefined function to check a buffer against file content.
* In case of different types, libxsmm_mhd_element_conversion
* is performed to compare values using the source-type.
*/
LIBXSMM_API int libxsmm_mhd_element_comparison(
void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type,
const void* src, const void* src_min, const void* src_max);
/** Returns the name and size of the element type; result may be NULL/0 in case of an unknown type. */
LIBXSMM_API const char* libxsmm_mhd_typename(libxsmm_mhd_elemtype type, size_t* typesize, const char** ctypename);
/** Returns the type of the element for a given type-name. */
LIBXSMM_API libxsmm_mhd_elemtype libxsmm_mhd_typeinfo(const char elemname[]);
/**
* Parse the header of an MHD-file. The header can be part of the data file (local),
 * or stored separately (header: MHD, data: MHA or RAW).
*/
LIBXSMM_API int libxsmm_mhd_read_header(
/* Filename referring to the header-file (may also contain the data). */
const char header_filename[],
/* Maximum length of path/file name. */
size_t filename_max_length,
/* Filename containing the data (may be the same as the header-file). */
char filename[],
/* Yields the maximum/possible number of dimensions on input,
* and the actual number of dimensions on output. */
size_t* ndims,
/* Image extents ("ndims" number of entries). */
size_t size[],
/* Number of interleaved image channels. */
size_t* ncomponents,
/* Type of the image elements (pixel type). */
libxsmm_mhd_elemtype* type,
/* Size of the header in bytes; may be used to skip the header,
* when reading content; can be a NULL-argument (optional). */
size_t* header_size,
  /* Size (in Bytes) of a user-defined extended data record;
* can be a NULL-argument (optional). */
size_t* extension_size);
/**
* Loads the data file, and optionally allows data conversion.
* Conversion is performed such that values are clamped to fit
* into the destination.
*/
LIBXSMM_API int libxsmm_mhd_read(
/* Filename referring to the data. */
const char filename[],
/* Offset within pitched buffer (NULL: no offset). */
const size_t offset[],
/* Image dimensions (extents). */
const size_t size[],
/* Leading buffer dimensions (NULL: same as size). */
const size_t pitch[],
/* Dimensionality (number of entries in size). */
size_t ndims,
/* Number of interleaved image channels. */
size_t ncomponents,
/* Used to skip the header, and to only read the data. */
size_t header_size,
/* Data element type as stored (pixel type). */
libxsmm_mhd_elemtype type_stored,
/* Storage type (data conversion, optional). */
const libxsmm_mhd_elemtype* type_data,
/* Buffer where the data is read into. */
void* data,
/**
* Optional callback executed per entry when reading the data.
* May assign the value to the left-most argument, but also
   * allows merely comparing against present data. Can be used to
* avoid allocating an actual destination.
*/
libxsmm_mhd_element_handler handle_element,
/* Post-content data (extension, optional). */
char extension[],
/* Size of the extension; can be zero. */
size_t extension_size);
/**
* Save a file using an extended data format, which is compatible with the Meta Image Format (MHD).
* The file is suitable for visual inspection using, e.g., ITK-SNAP or ParaView.
*/
LIBXSMM_API int libxsmm_mhd_write(const char filename[],
/* Offset within pitched buffer (NULL: no offset). */
const size_t offset[],
/* Image dimensions (extents). */
const size_t size[],
/* Leading buffer dimensions (NULL: same as size). */
const size_t pitch[],
/* Dimensionality, i.e., number of entries in data_size/size. */
size_t ndims,
/* Number of pixel components. */
size_t ncomponents,
/* Type (input). */
libxsmm_mhd_elemtype type_data,
/* Type (data conversion, optional). */
const libxsmm_mhd_elemtype* type,
/* Raw data to be saved. */
const void* data,
/* Size of the header; can be a NULL-argument (optional). */
size_t* header_size,
/* Extension header data; can be NULL. */
const char extension_header[],
/* Extension data stream; can be NULL. */
const void* extension,
/* Extension data size; can be NULL. */
size_t extension_size);
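/* Usage sketch (illustrative only): store a 256 x 256 single-channel FP32
 * image ("data") without offset/pitch, type conversion, or extension records.
 *   const size_t size[] = { 256, 256 };
 *   libxsmm_mhd_write("image.mhd", NULL, size, NULL, 2, 1,
 *     LIBXSMM_MHD_ELEMTYPE_F32, NULL, data, NULL, NULL, NULL, 0);
 */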
#endif /*LIBXSMM_MHD_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_RNG_H
#define LIBXSMM_RNG_H
#include "libxsmm_typedefs.h"
/**
 * Create a new external state for thread-safe execution managed
 * by the user. No function is provided for drawing the random numbers;
 * the user is supposed to call the LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS
 * or LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32 intrinsic.
 */
LIBXSMM_API unsigned int* libxsmm_rng_create_extstate(unsigned int/*uint32_t*/ seed);
/** Free an external state previously created with libxsmm_rng_create_extstate. */
LIBXSMM_API void libxsmm_rng_destroy_extstate(unsigned int* stateptr);
/** Set the seed of libxsmm_rng_* (similar to srand). */
LIBXSMM_API void libxsmm_rng_set_seed(unsigned int/*uint32_t*/ seed);
/**
 * This SP-RNG uses xoshiro128+ 1.0, work done by
 * David Blackman and Sebastiano Vigna (vigna@acm.org).
 * It is their best and fastest 32-bit generator for
 * 32-bit floating-point numbers. They suggest using
 * its upper bits for floating-point generation, which
 * is done here to generate numbers in [0,1).
 */
LIBXSMM_API void libxsmm_rng_f32_seq(float* rngs, libxsmm_blasint count);
/**
* Returns a (pseudo-)random value based on rand/rand48 in the interval [0, n).
 * This function compensates for an n that does not evenly divide RAND_MAX.
* Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator.
*/
LIBXSMM_API unsigned int libxsmm_rng_u32(unsigned int n);
/** Sequence of random data based on libxsmm_rng_u32. */
LIBXSMM_API void libxsmm_rng_seq(void* data, libxsmm_blasint nbytes);
/**
* Similar to libxsmm_rng_u32, but returns a DP-value in the interval [0, 1).
* Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator.
*/
LIBXSMM_API double libxsmm_rng_f64(void);
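/* Usage sketch (illustrative only; the seed value is arbitrary):
 *   float r32[16];
 *   libxsmm_rng_set_seed(25071975);
 *   libxsmm_rng_f32_seq(r32, 16);                 16 values in [0,1)
 *   const unsigned int u = libxsmm_rng_u32(10);   value in [0,10)
 *   const double d = libxsmm_rng_f64();           value in [0,1)
 */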
#endif /* LIBXSMM_RNG_H */
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_SOURCE_H
#define LIBXSMM_SOURCE_H
#if defined(LIBXSMM_MACROS_H)
# error Please do not include any LIBXSMM header other than libxsmm_source.h!
#endif
#if defined(LIBXSMM_BUILD)
# error LIBXSMM_BUILD cannot be defined for the header-only LIBXSMM!
#endif
/**
 * This header is intentionally called "libxsmm_source.h" since the following block
 * includes *internal* files, and thereby exposes LIBXSMM's implementation.
 * The so-called "header-only" usage model gives up the clearly defined binary interface
 * (including support for hot-fixes after deployment), and requires rebuilding client
 * code for every (internal) change of LIBXSMM. Please make sure to only rely on the
 * public interface as the internal implementation may change without notice.
 */
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
#include "../src/generator_aarch64_instructions.c"
#include "../src/generator_common.c"
#include "../src/generator_common_aarch64.c"
#include "../src/generator_common_x86.c"
#include "../src/generator_gemm.c"
#include "../src/generator_gemm_aarch64.c"
#include "../src/generator_gemm_amx.c"
#include "../src/generator_gemm_amx_emu.c"
#include "../src/generator_gemm_amx_microkernel.c"
#include "../src/generator_gemm_amx_microkernel_emu.c"
#include "../src/generator_gemm_avx2_microkernel.c"
#include "../src/generator_gemm_avx512_microkernel.c"
#include "../src/generator_gemm_avx_microkernel.c"
#include "../src/generator_gemm_common.c"
#include "../src/generator_gemm_common_aarch64.c"
#include "../src/generator_gemm_noarch.c"
#include "../src/generator_gemm_sse_avx_avx2_avx512.c"
#include "../src/generator_gemm_sse_microkernel.c"
#include "../src/generator_mateltwise.c"
#include "../src/generator_mateltwise_misc_avx_avx512.c"
#include "../src/generator_mateltwise_reduce_avx_avx512.c"
#include "../src/generator_mateltwise_sse_avx_avx512.c"
#include "../src/generator_mateltwise_transform_avx.c"
#include "../src/generator_mateltwise_transform_avx512.c"
#include "../src/generator_mateltwise_transform_common.c"
#include "../src/generator_mateltwise_transform_common_x86.c"
#include "../src/generator_mateltwise_transform_sse.c"
#include "../src/generator_mateltwise_unary_binary_avx_avx512.c"
#include "../src/generator_matequation.c"
#include "../src/generator_matequation_avx_avx512.c"
#include "../src/generator_matequation_regblocks_avx_avx512.c"
#include "../src/generator_matequation_scratch_avx_avx512.c"
#include "../src/generator_packed_gemm_ac_rm.c"
#include "../src/generator_packed_gemm_ac_rm_aarch64.c"
#include "../src/generator_packed_gemm_ac_rm_avx_avx2_avx512.c"
#include "../src/generator_packed_gemm_bc_rm.c"
#include "../src/generator_packed_gemm_bc_rm_aarch64.c"
#include "../src/generator_packed_gemm_bc_rm_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm.c"
#include "../src/generator_packed_spgemm_csc_bsparse.c"
#include "../src/generator_packed_spgemm_csc_bsparse_aarch64.c"
#include "../src/generator_packed_spgemm_csc_bsparse_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm_csc_csparse.c"
#include "../src/generator_packed_spgemm_csc_csparse_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm_csr_asparse.c"
#include "../src/generator_packed_spgemm_csr_asparse_aarch64.c"
#include "../src/generator_packed_spgemm_csr_asparse_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm_csr_bsparse.c"
#include "../src/generator_packed_spgemm_csr_bsparse_aarch64.c"
#include "../src/generator_packed_spgemm_csr_bsparse_avx_avx2_avx512.c"
#include "../src/generator_spgemm.c"
#include "../src/generator_spgemm_csc_asparse.c"
#include "../src/generator_spgemm_csc_bsparse.c"
#include "../src/generator_spgemm_csc_reader.c"
#include "../src/generator_spgemm_csr_asparse.c"
#include "../src/generator_spgemm_csr_asparse_reg.c"
#include "../src/generator_spgemm_csr_reader.c"
#include "../src/generator_x86_instructions.c"
#include "../src/libxsmm_cpuid_arm.c"
#include "../src/libxsmm_cpuid_x86.c"
#include "../src/libxsmm_dnn.c"
#include "../src/libxsmm_dnn_convolution.c"
#include "../src/libxsmm_dnn_convolution_backward.c"
#include "../src/libxsmm_dnn_convolution_forward.c"
#include "../src/libxsmm_dnn_convolution_weight_update.c"
#include "../src/libxsmm_dnn_elementwise.c"
#include "../src/libxsmm_dnn_fullyconnected.c"
#include "../src/libxsmm_dnn_fullyconnected_backward_weight_update.c"
#include "../src/libxsmm_dnn_fullyconnected_forward.c"
#include "../src/libxsmm_dnn_fusedbatchnorm.c"
#include "../src/libxsmm_dnn_fusedbatchnorm_backward.c"
#include "../src/libxsmm_dnn_fusedbatchnorm_forward.c"
#include "../src/libxsmm_dnn_fusedgroupnorm.c"
#include "../src/libxsmm_dnn_fusedgroupnorm_backward.c"
#include "../src/libxsmm_dnn_fusedgroupnorm_forward.c"
#include "../src/libxsmm_dnn_optimizer.c"
#include "../src/libxsmm_dnn_optimizer_sgd.c"
#include "../src/libxsmm_dnn_pooling.c"
#include "../src/libxsmm_dnn_pooling_backward.c"
#include "../src/libxsmm_dnn_pooling_forward.c"
#include "../src/libxsmm_dnn_rnncell.c"
#include "../src/libxsmm_dnn_rnncell_backward_weight_update.c"
#include "../src/libxsmm_dnn_rnncell_forward.c"
#include "../src/libxsmm_dnn_softmaxloss.c"
#include "../src/libxsmm_dnn_softmaxloss_backward.c"
#include "../src/libxsmm_dnn_softmaxloss_forward.c"
#include "../src/libxsmm_dnn_tensor.c"
#include "../src/libxsmm_ext.c"
#include "../src/libxsmm_ext_gemm.c"
#include "../src/libxsmm_ext_xcopy.c"
#include "../src/libxsmm_fsspmdm.c"
#include "../src/libxsmm_gemm.c"
#include "../src/libxsmm_generator.c"
#include "../src/libxsmm_hash.c"
#include "../src/libxsmm_main.c"
#include "../src/libxsmm_malloc.c"
#include "../src/libxsmm_math.c"
#include "../src/libxsmm_matrixeqn.c"
#include "../src/libxsmm_memory.c"
#include "../src/libxsmm_mhd.c"
#include "../src/libxsmm_perf.c"
#include "../src/libxsmm_python.c"
#include "../src/libxsmm_rng.c"
#include "../src/libxsmm_spmdm.c"
#include "../src/libxsmm_sync.c"
#include "../src/libxsmm_timer.c"
#include "../src/libxsmm_trace.c"
#include "../src/libxsmm_xcopy.c"
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
#endif /*LIBXSMM_SOURCE_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_SPMDM_H
#define LIBXSMM_SPMDM_H
#include "libxsmm_typedefs.h"
typedef enum libxsmm_spmdm_datatype {
LIBXSMM_SPMDM_DATATYPE_F32,
LIBXSMM_SPMDM_DATATYPE_BFLOAT16
} libxsmm_spmdm_datatype;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spmdm_handle {
/* The following are the matrix multiply dimensions: A (sparse): m X k, B (dense): k X n, Output C (dense): m X n */
int m;
int n;
int k;
/* The block sizes for A, B and C. */
/* Here we fix A to be divided into 128 X 128 blocks, B/C to be 128 X 48 for HSW/BDW and 128 X 96 for SKX */
int bm;
int bn;
int bk;
/* The number of blocks for the m, n and k dimensions */
int mb;
int nb;
int kb;
libxsmm_spmdm_datatype datatype;
char* base_ptr_scratch_A;
char* base_ptr_scratch_B_scratch_C;
int memory_for_scratch_per_thread;
} libxsmm_spmdm_handle;
/**
 * This stores a single sparse slice (or block) of the sparse matrix A using a CSR representation (rowidx, colidx, and values).
 * Each slice corresponds to a bm X bk region of A, and stores local indexes.
 */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_CSR_sparseslice {
/* Since bm and bk are assumed to be <=256, a 16-bit integer is enough to store the local rowidx, colidx */
uint16_t* rowidx;
uint16_t* colidx;
float* values;
} libxsmm_CSR_sparseslice;
LIBXSMM_API void libxsmm_spmdm_init(
int M, int N, int K,
int max_threads,
libxsmm_spmdm_handle* handle,
libxsmm_CSR_sparseslice** libxsmm_output_csr);
LIBXSMM_API void libxsmm_spmdm_destroy(
libxsmm_spmdm_handle* handle);
LIBXSMM_API int libxsmm_spmdm_get_num_createSparseSlice_blocks(
const libxsmm_spmdm_handle* handle);
LIBXSMM_API int libxsmm_spmdm_get_num_compute_blocks(
const libxsmm_spmdm_handle* handle);
/** This converts a dense representation of the sparse matrix to 2D array of sparse slices. */
LIBXSMM_API void libxsmm_spmdm_createSparseSlice_fp32_thread(
const libxsmm_spmdm_handle* handle,
char transa,
const float* a,
libxsmm_CSR_sparseslice* libxsmm_output_csr_a,
int block_id,
int tid, int nthreads);
LIBXSMM_API void libxsmm_spmdm_createSparseSlice_bfloat16_thread(
const libxsmm_spmdm_handle* handle,
char transa,
const libxsmm_bfloat16* a,
libxsmm_CSR_sparseslice* libxsmm_output_csr_a,
int block_id,
int tid, int nthreads);
/** NOTE: This code currently ignores alpha input to the matrix multiply */
LIBXSMM_API void libxsmm_spmdm_compute_fp32_thread(
const libxsmm_spmdm_handle* handle,
char transa,
char transb,
const float* alpha,
libxsmm_CSR_sparseslice* a_sparse,
const float* b,
char transc,
const float* beta,
float* c,
int block_id,
int tid, int nthreads);
/** NOTE: This code currently ignores alpha input to the matrix multiply */
LIBXSMM_API void libxsmm_spmdm_compute_bfloat16_thread(
const libxsmm_spmdm_handle* handle,
char transa,
char transb,
const libxsmm_bfloat16* alpha,
libxsmm_CSR_sparseslice* a_sparse,
const libxsmm_bfloat16* b,
char transc,
const libxsmm_bfloat16* beta,
float* c,
int block_id,
int tid, int nthreads);
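/* Usage sketch (illustrative only; single-threaded FP32 flow, error handling
 * omitted; M, N, K, a, b, c, alpha, beta are assumed to be defined): first
 * initialize, then build the sparse slices of A block-by-block, compute all
 * blocks of C (note: alpha is currently ignored, see above), and finally
 * destroy the handle.
 *   libxsmm_spmdm_handle handle;
 *   libxsmm_CSR_sparseslice* a_sparse = NULL;
 *   int i;
 *   libxsmm_spmdm_init(M, N, K, 1, &handle, &a_sparse);
 *   for (i = 0; i < libxsmm_spmdm_get_num_createSparseSlice_blocks(&handle); ++i)
 *     libxsmm_spmdm_createSparseSlice_fp32_thread(&handle, 'N', a, a_sparse, i, 0, 1);
 *   for (i = 0; i < libxsmm_spmdm_get_num_compute_blocks(&handle); ++i)
 *     libxsmm_spmdm_compute_fp32_thread(&handle, 'N', 'N', &alpha, a_sparse,
 *       b, 'N', &beta, c, i, 0, 1);
 *   libxsmm_spmdm_destroy(&handle);
 */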
#endif /*LIBXSMM_SPMDM_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_SYNC_H
#define LIBXSMM_SYNC_H
#include "libxsmm_intrinsics_x86.h"
#if !defined(LIBXSMM_TLS)
# if (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_NO_TLS)
# if defined(__CYGWIN__) && defined(__clang__)
# define LIBXSMM_NO_TLS
# define LIBXSMM_TLS
# else
# if (defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__)) || (defined(__PGI) && !defined(__cplusplus))
# define LIBXSMM_TLS LIBXSMM_ATTRIBUTE(thread)
# elif defined(__GNUC__) || defined(__clang__) || defined(_CRAYC)
# define LIBXSMM_TLS __thread
# elif defined(__cplusplus)
# define LIBXSMM_TLS thread_local
# else
# error Missing TLS support!
# endif
# endif
# else
# if !defined(LIBXSMM_NO_TLS)
# define LIBXSMM_NO_TLS
# endif
# define LIBXSMM_TLS
# endif
#endif
#if !defined(LIBXSMM_GCC_BASELINE) && !defined(LIBXSMM_SYNC_LEGACY) && ((defined(_WIN32) && defined(__clang__)) || \
(defined(__GNUC__) && LIBXSMM_VERSION2(4, 7) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)))
# define LIBXSMM_GCC_BASELINE
#endif
#if defined(__MIC__)
# define LIBXSMM_SYNC_PAUSE _mm_delay_32(8/*delay*/)
#elif !defined(LIBXSMM_INTRINSICS_NONE)
# if defined(LIBXSMM_GCC_BASELINE) && !defined(__INTEL_COMPILER)
# define LIBXSMM_SYNC_PAUSE __builtin_ia32_pause()
# else
# define LIBXSMM_SYNC_PAUSE _mm_pause()
# endif
#elif (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH) && defined(__GNUC__)
# define LIBXSMM_SYNC_PAUSE __asm__ __volatile__("pause" ::: "memory")
#else
# define LIBXSMM_SYNC_PAUSE
#endif
/* permit thread-unsafe execution */
#if !defined(LIBXSMM_SYNC_NONE) && ( \
(defined(__PGI) && (!defined(LIBXSMM_LIBATOMIC) || !defined(__STATIC))) || \
(defined(_CRAYC) && !defined(__GNUC__)))
# define LIBXSMM_SYNC_NONE
#endif
#if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) && 0
# define LIBXSMM_ATOMIC_TRYLOCK_CMPSWP
#endif
#if !defined(LIBXSMM_ATOMIC_ZERO_STORE) && defined(_CRAYC)
# define LIBXSMM_ATOMIC_ZERO_STORE
#endif
#if !defined(LIBXSMM_ATOMIC_LOCKTYPE)
# if defined(_WIN32) || 1/*alignment*/
# define LIBXSMM_ATOMIC_LOCKTYPE int
# else
# define LIBXSMM_ATOMIC_LOCKTYPE char
# endif
#endif
typedef enum libxsmm_atomic_kind {
#if defined(__ATOMIC_SEQ_CST)
LIBXSMM_ATOMIC_SEQ_CST = __ATOMIC_SEQ_CST,
#else
LIBXSMM_ATOMIC_SEQ_CST = 0,
#endif
#if defined(__ATOMIC_RELAXED)
LIBXSMM_ATOMIC_RELAXED = __ATOMIC_RELAXED
#else
LIBXSMM_ATOMIC_RELAXED = LIBXSMM_ATOMIC_SEQ_CST
#endif
} libxsmm_atomic_kind;
#define LIBXSMM_NONATOMIC_LOCKTYPE LIBXSMM_ATOMIC_LOCKTYPE
#define LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) (*(SRC_PTR))
#define LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) { LIBXSMM_UNUSED(KIND); *(DST_PTR) = (VALUE); }
#define LIBXSMM_NONATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND)
#define LIBXSMM_NONATOMIC_FETCH_OR(DST_PTR, VALUE/*side-effect*/, KIND) (/* 1st step: swap(dst, val) */ \
((*DST_PTR) = (*DST_PTR) ^ (VALUE)), (VALUE = (VALUE) ^ (*DST_PTR)), ((*DST_PTR) = (*DST_PTR) ^ (VALUE)), \
(*(DST_PTR) |= VALUE), (VALUE) /* 2nd step: or, and 3rd/last step: original dst-value */)
#define LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) += VALUE)
#define LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) -= VALUE)
#define LIBXSMM_NONATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) - (VALUE)))
#define LIBXSMM_NONATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) + (VALUE)))
#define LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) ((NEWVAL) == (*(DST_PTR) == (OLDVAL) ? (*(DST_PTR) = (NEWVAL)) : (OLDVAL)))
#define LIBXSMM_NONATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, 0, 1, KIND)
#define LIBXSMM_NONATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) { LIBXSMM_UNUSED(NPAUSE); \
LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 1, KIND); \
LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); }
#define LIBXSMM_NONATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_UNUSED(DST_PTR); LIBXSMM_UNUSED(KIND); \
LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND); \
LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); }
#define LIBXSMM_NONATOMIC_SYNC(KIND) LIBXSMM_UNUSED(KIND)
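/* Usage sketch (illustrative only): the LIBXSMM_ATOMIC_* counterparts defined
 * below fall back to the above non-atomic forms if LIBXSMM_SYNC is disabled.
 *   static LIBXSMM_ATOMIC_LOCKTYPE lock = 0;
 *   unsigned int counter = 0;
 *   LIBXSMM_ATOMIC_ADD_FETCH(&counter, 1, LIBXSMM_ATOMIC_RELAXED);
 *   if (LIBXSMM_ATOMIC_TRYLOCK(&lock, LIBXSMM_ATOMIC_SEQ_CST)) {
 *     ...  critical section
 *     LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_SEQ_CST);
 *   }
 */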
#if (0 == LIBXSMM_SYNC) || defined(LIBXSMM_SYNC_NONE)
# define LIBXSMM_ATOMIC(FN, BITS) FN
# define LIBXSMM_ATOMIC_LOAD LIBXSMM_NONATOMIC_LOAD
# define LIBXSMM_ATOMIC_STORE LIBXSMM_NONATOMIC_STORE
# define LIBXSMM_ATOMIC_STORE_ZERO LIBXSMM_NONATOMIC_STORE_ZERO
# define LIBXSMM_ATOMIC_FETCH_OR LIBXSMM_NONATOMIC_FETCH_OR
# define LIBXSMM_ATOMIC_ADD_FETCH LIBXSMM_NONATOMIC_ADD_FETCH
# define LIBXSMM_ATOMIC_SUB_FETCH LIBXSMM_NONATOMIC_SUB_FETCH
# define LIBXSMM_ATOMIC_FETCH_ADD LIBXSMM_NONATOMIC_FETCH_ADD
# define LIBXSMM_ATOMIC_FETCH_SUB LIBXSMM_NONATOMIC_FETCH_SUB
# define LIBXSMM_ATOMIC_CMPSWP LIBXSMM_NONATOMIC_CMPSWP
# define LIBXSMM_ATOMIC_TRYLOCK LIBXSMM_NONATOMIC_TRYLOCK
# define LIBXSMM_ATOMIC_ACQUIRE LIBXSMM_NONATOMIC_ACQUIRE
# define LIBXSMM_ATOMIC_RELEASE LIBXSMM_NONATOMIC_RELEASE
# define LIBXSMM_ATOMIC_SYNC LIBXSMM_NONATOMIC_SYNC
# if !defined(LIBXSMM_SYNC_NPAUSE)
# define LIBXSMM_SYNC_NPAUSE 0
# endif
#elif (defined(LIBXSMM_GCC_BASELINE) || defined(LIBXSMM_LIBATOMIC) /* GNU's libatomic required */ || \
(defined(__GNUC__) && LIBXSMM_VERSION2(4, 1) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)))
# if defined(LIBXSMM_LIBATOMIC)
# define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, BITS)(FN)
# define LIBXSMM_ATOMIC8(FN) LIBXSMM_CONCATENATE(FN, 8)
# define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16)
# define LIBXSMM_ATOMIC32(FN) FN/*default*/
# define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64)
# if defined(__PGI)
# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
# else
# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_4(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) __atomic_load_1(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) __atomic_load_2(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) __atomic_load_8(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_4(DST_PTR, (unsigned int)(VALUE), KIND)
# define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) __atomic_store_1(DST_PTR, (unsigned char)(VALUE), KIND)
# define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) __atomic_store_2(DST_PTR, (unsigned short)(VALUE), KIND)
# define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) __atomic_store_8(DST_PTR, (unsigned long long)(VALUE), KIND)
# endif
# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or_4(DST_PTR, (unsigned int)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) __atomic_fetch_or_1(DST_PTR, (unsigned char)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_OR16(DST_PTR, VALUE, KIND) __atomic_fetch_or_2(DST_PTR, (unsigned short)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_OR64(DST_PTR, VALUE, KIND) __atomic_fetch_or_8(DST_PTR, (unsigned long long)(VALUE), KIND)
# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch_4(DST_PTR, (int)(VALUE), KIND)
# define LIBXSMM_ATOMIC_ADD_FETCH8(DST_PTR, VALUE, KIND) __atomic_add_fetch_1(DST_PTR, (signed char)(VALUE), KIND)
# define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) __atomic_add_fetch_2(DST_PTR, (short)(VALUE), KIND)
# define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) __atomic_add_fetch_8(DST_PTR, (long long)(VALUE), KIND)
# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch_4(DST_PTR, (int)(VALUE), KIND)
# define LIBXSMM_ATOMIC_SUB_FETCH8(DST_PTR, VALUE, KIND) __atomic_sub_fetch_1(DST_PTR, (signed char)(VALUE), KIND)
# define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) __atomic_sub_fetch_2(DST_PTR, (short)(VALUE), KIND)
# define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) __atomic_sub_fetch_8(DST_PTR, (long long)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add_4(DST_PTR, (int)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_ADD8(DST_PTR, VALUE, KIND) __atomic_fetch_add_1(DST_PTR, (signed char)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) __atomic_fetch_add_2(DST_PTR, (short)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) __atomic_fetch_add_8(DST_PTR, (long long)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub_4(DST_PTR, (int)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB8(DST_PTR, VALUE, KIND) __atomic_fetch_sub_1(DST_PTR, (signed char)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) __atomic_fetch_sub_2(DST_PTR, (short)(VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) __atomic_fetch_sub_8(DST_PTR, (long long)(VALUE), KIND)
# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) \
__atomic_compare_exchange_4(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) \
__atomic_compare_exchange_1(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_ATOMIC_CMPSWP16(DST_PTR, OLDVAL, NEWVAL, KIND) \
__atomic_compare_exchange_2(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_ATOMIC_CMPSWP64(DST_PTR, OLDVAL, NEWVAL, KIND) \
__atomic_compare_exchange_8(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND))
# endif
# if defined(__PGI)
# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND); } /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */
# else
# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
__atomic_clear(DST_PTR, KIND); }
# endif
# define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize()
# if !defined(LIBXSMM_ATOMIC_ZERO_STORE)
# define LIBXSMM_ATOMIC_ZERO_STORE
# endif
# elif defined(LIBXSMM_GCC_BASELINE)
# define LIBXSMM_ATOMIC(FN, BITS) FN
# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_n(SRC_PTR, KIND)
# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_n(DST_PTR, VALUE, KIND)
# if !defined(LIBXSMM_ATOMIC_ZERO_STORE)
# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__atomic_and_fetch(DST_PTR, 0, KIND))
# endif
# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub(DST_PTR, VALUE, KIND)
# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL)
# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND))
# endif
# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
__atomic_clear(DST_PTR, KIND); }
# if 0 /* __atomic_thread_fence: incorrect behavior in libxsmm_barrier (even with LIBXSMM_ATOMIC_SEQ_CST) */
# define LIBXSMM_ATOMIC_SYNC(KIND) __atomic_thread_fence(KIND)
# else
# define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize()
# endif
# else /* GCC legacy atomics */
# define LIBXSMM_ATOMIC(FN, BITS) FN
# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __sync_or_and_fetch(SRC_PTR, 0)
# if (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) { \
__asm__ __volatile__("" ::: "memory"); *(DST_PTR) = (VALUE); \
__asm__ __volatile__("" ::: "memory"); }
# else
# define LIBXSMM_ATOMIC_SYNC_NOFENCE(KIND)
# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) *(DST_PTR) = (VALUE)
# endif
# if !defined(LIBXSMM_ATOMIC_ZERO_STORE)
# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__sync_and_and_fetch(DST_PTR, 0))
# endif
# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __sync_fetch_and_or(DST_PTR, VALUE)
# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __sync_add_and_fetch(DST_PTR, VALUE)
# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __sync_sub_and_fetch(DST_PTR, VALUE)
# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __sync_fetch_and_add(DST_PTR, VALUE)
# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __sync_fetch_and_sub(DST_PTR, VALUE)
# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL)
# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == __sync_lock_test_and_set(DST_PTR, 1))
# endif
# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
__sync_lock_release(DST_PTR); }
# define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize()
# endif
# if defined(LIBXSMM_ATOMIC_ZERO_STORE)
# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND)
# define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 8)(DST_PTR, 0, KIND)
# define LIBXSMM_ATOMIC_STORE_ZERO16(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 16)(DST_PTR, 0, KIND)
# define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 64)(DST_PTR, 0, KIND)
# endif
# if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */ \
(0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND))
# endif
# define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \
LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4)); \
while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \
LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE")
# if !defined(LIBXSMM_SYNC_NPAUSE)
# define LIBXSMM_SYNC_NPAUSE 4096
# endif
#elif defined(_WIN32)
# define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, BITS)(FN)
# define LIBXSMM_ATOMIC8(FN) LIBXSMM_CONCATENATE(FN, 8)
# define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16)
# define LIBXSMM_ATOMIC32(FN) FN/*default*/
# define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64)
# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) InterlockedOr((volatile LONG*)(SRC_PTR), 0)
# define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) _InterlockedOr8((volatile char*)(SRC_PTR), 0)
# define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) InterlockedOr64((volatile LONGLONG*)(SRC_PTR), 0)
# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) InterlockedExchange((volatile LONG*)(DST_PTR), (LONG)(VALUE))
# define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) InterlockedExchange8((volatile char*)(DST_PTR), (char)(VALUE))
# define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) InterlockedExchange64((volatile LONGLONG*)(DST_PTR), (LONGLONG)(VALUE))
# if defined(LIBXSMM_ATOMIC_ZERO_STORE)
# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND)
# define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE8(DST_PTR, 0, KIND)
# define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE64(DST_PTR, 0, KIND)
# else
# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) InterlockedAnd((volatile LONG*)(DST_PTR), 0)
# define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) InterlockedAnd8((volatile char*)(DST_PTR), 0)
# define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) InterlockedAnd64((volatile LONGLONG*)(DST_PTR), 0)
# endif
# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) InterlockedOr((volatile LONG*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) _InterlockedOr8((volatile char*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) + (VALUE))
# define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) + (VALUE))
# define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) + (VALUE))
# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) ((size_t)LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) - ((size_t)VALUE))
# define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) - (VALUE))
# define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) - (VALUE))
# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) InterlockedExchangeAdd((volatile LONG*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) _InterlockedExchangeAdd16((volatile SHORT*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) InterlockedExchangeAdd64((volatile LONGLONG*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, -1 * (VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, -1 * (VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, -1 * (VALUE), KIND)
# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) (((LONG)(OLDVAL)) == InterlockedCompareExchange((volatile LONG*)(DST_PTR), NEWVAL, OLDVAL))
# define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) ((OLDVAL) == _InterlockedCompareExchange8((volatile char*)(DST_PTR), NEWVAL, OLDVAL))
# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_CMPSWP, 8)(DST_PTR, 0, 1, KIND)
# else
# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND))
# endif
# define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \
LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4)); \
while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \
LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE")
# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { \
LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, 8)(DST_PTR, KIND); }
# define LIBXSMM_ATOMIC_SYNC(KIND) _ReadWriteBarrier()
# if !defined(LIBXSMM_SYNC_NPAUSE)
# define LIBXSMM_SYNC_NPAUSE 4096
# endif
#else /* consider permitting LIBXSMM_SYNC_NONE */
# error LIBXSMM is missing atomic compiler builtins!
#endif
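/* A minimal usage sketch (illustration only, not part of this header), assuming
 * the 32-bit default width and the LIBXSMM_ATOMIC_RELAXED memory-order kind
 * defined earlier; guarded by "#if 0" so it does not affect compilation. */
#if 0
static volatile int counter; /* shared, zero-initialized */
static void count_event(void) {
  /* atomically increment the counter and observe the new value */
  const int n = LIBXSMM_ATOMIC_ADD_FETCH(&counter, 1, LIBXSMM_ATOMIC_RELAXED);
  if (1 == n) {
    /* exactly one thread observes n == 1, e.g., for one-time setup */
  }
}
#endif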
#if !defined(LIBXSMM_SYNC_CYCLE)
# if (0 < LIBXSMM_SYNC_NPAUSE)
# define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) do { int libxsmm_sync_cycle_npause_ = 1; \
do { int libxsmm_sync_cycle_counter_ = 0; \
for (; libxsmm_sync_cycle_counter_ < libxsmm_sync_cycle_npause_; ++libxsmm_sync_cycle_counter_) LIBXSMM_SYNC_PAUSE; \
if (libxsmm_sync_cycle_npause_ < (NPAUSE)) { \
libxsmm_sync_cycle_npause_ *= 2; \
} \
else { \
libxsmm_sync_cycle_npause_ = (NPAUSE); \
LIBXSMM_SYNC_YIELD; \
ELSE \
} \
} while(((EXP_STATE) & 1) != (*(DST_PTR) & 1)); \
} while(0)
# else
# define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) LIBXSMM_SYNC_PAUSE
# endif
# define LIBXSMM_SYNC_CYCLE(DST_PTR, EXP_STATE, NPAUSE) \
LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, /*else*/;)
#endif
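/* Sketch of the spin-wait pattern LIBXSMM_SYNC_CYCLE expands to (illustration
 * only): pause in exponentially growing bursts up to NPAUSE, then yield, until
 * the low bit of the flag matches the expected state. The flag name is
 * hypothetical; guarded by "#if 0" so it does not affect compilation. */
#if 0
static volatile LIBXSMM_ATOMIC_LOCKTYPE flag; /* shared, zero-initialized */
static void wait_until_set(void) {
  /* spins while (flag & 1) != (1 & 1), i.e., until the flag's low bit is 1 */
  LIBXSMM_SYNC_CYCLE(&flag, 1/*expected state*/, LIBXSMM_SYNC_NPAUSE);
}
#endif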
#if (0 != LIBXSMM_SYNC)
# define LIBXSMM_LOCK_DEFAULT LIBXSMM_LOCK_SPINLOCK
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \
(!defined(__linux__) || defined(__USE_XOPEN2K)) && 0/*disabled*/
# define LIBXSMM_LOCK_SYSTEM_SPINLOCK
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP))
# define LIBXSMM_LOCK_SYSTEM_MUTEX
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \
(!defined(__linux__) || defined(__USE_XOPEN2K) || defined(__USE_UNIX98))
# define LIBXSMM_LOCK_SYSTEM_RWLOCK
# endif
/* Lock type, initialization, destruction, (try-)lock, unlock, etc */
# define LIBXSMM_LOCK_ACQUIRED(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRED_, KIND)
# define LIBXSMM_LOCK_TYPE_ISPOD(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISPOD_, KIND)
# define LIBXSMM_LOCK_TYPE_ISRW(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISRW_, KIND)
# define LIBXSMM_LOCK_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_, KIND)
# define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_INIT_, KIND)(LOCK, ATTR)
# define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_DESTROY_, KIND)(LOCK)
# define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYLOCK_, KIND)(LOCK)
# define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRE_, KIND)(LOCK)
# define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELEASE_, KIND)(LOCK)
# define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYREAD_, KIND)(LOCK)
# define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQREAD_, KIND)(LOCK)
# define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELREAD_, KIND)(LOCK)
/* Attribute type, initialization, destruction */
# define LIBXSMM_LOCK_ATTR_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_TYPE_, KIND)
# define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_INIT_, KIND)(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_DESTROY_, KIND)(ATTR)
/* Cygwin's Pthread implementation appears to be broken; use Win32 */
# if !defined(LIBXSMM_WIN32_THREADS) && (defined(_WIN32) || defined(__CYGWIN__))
# define LIBXSMM_WIN32_THREADS _WIN32_WINNT
# if defined(__CYGWIN__) || defined(__MINGW32__) /* hack: make SRW-locks available */
# if defined(_WIN32_WINNT)
# undef _WIN32_WINNT
# if !defined(NTDDI_VERSION)
# define NTDDI_VERSION 0x0600
# endif
# define _WIN32_WINNT ((LIBXSMM_WIN32_THREADS) | 0x0600)
# else
# define _WIN32_WINNT 0x0600
# endif
# endif
# endif
# if defined(LIBXSMM_WIN32_THREADS)
# define LIBXSMM_TLS_TYPE DWORD
# define LIBXSMM_TLS_CREATE(KEYPTR) *(KEYPTR) = TlsAlloc()
# define LIBXSMM_TLS_DESTROY(KEY) TlsFree(KEY)
# define LIBXSMM_TLS_SETVALUE(KEY, PTR) TlsSetValue(KEY, PTR)
# define LIBXSMM_TLS_GETVALUE(KEY) TlsGetValue(KEY)
# define LIBXSMM_LOCK_SPINLOCK spin
# if ((LIBXSMM_WIN32_THREADS) & 0x0600)
# define LIBXSMM_LOCK_MUTEX rwlock
# define LIBXSMM_LOCK_RWLOCK rwlock
# else /* mutex exposes high latency */
# define LIBXSMM_LOCK_MUTEX mutex
# define LIBXSMM_LOCK_RWLOCK mutex
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
# define LIBXSMM_LOCK_ACQUIRED_spin TRUE
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin CRITICAL_SECTION
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeCriticalSection(LOCK); }
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) DeleteCriticalSection((LIBXSMM_LOCK_TYPE_spin*)(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) TryEnterCriticalSection(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) EnterCriticalSection(LOCK)
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LeaveCriticalSection(LOCK)
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
# define LIBXSMM_LOCK_ACQUIRED_mutex WAIT_OBJECT_0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex HANDLE
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) (*(LOCK) = CreateMutex(*(ATTR), FALSE, NULL))
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) CloseHandle(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) WaitForSingleObject(*(LOCK), 0)
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) WaitForSingleObject(*(LOCK), INFINITE)
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) ReleaseMutex(*(LOCK))
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex LPSECURITY_ATTRIBUTES
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = NULL)
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
# define LIBXSMM_LOCK_ACQUIRED_rwlock TRUE
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1
# define LIBXSMM_LOCK_TYPE_rwlock SRWLOCK
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeSRWLock(LOCK); }
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) TryAcquireSRWLockExclusive(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) AcquireSRWLockExclusive(LOCK)
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) ReleaseSRWLockExclusive(LOCK)
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) TryAcquireSRWLockShared(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) AcquireSRWLockShared(LOCK)
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) ReleaseSRWLockShared(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_SYNC_YIELD YieldProcessor()
# else
# define LIBXSMM_TLS_TYPE pthread_key_t
# define LIBXSMM_TLS_CREATE(KEYPTR) pthread_key_create(KEYPTR, NULL)
# define LIBXSMM_TLS_DESTROY(KEY) pthread_key_delete(KEY)
# define LIBXSMM_TLS_SETVALUE(KEY, PTR) pthread_setspecific(KEY, PTR)
# define LIBXSMM_TLS_GETVALUE(KEY) pthread_getspecific(KEY)
# if defined(__APPLE__) && defined(__MACH__)
# define LIBXSMM_SYNC_YIELD pthread_yield_np()
# else
# if defined(__USE_GNU) || !defined(__BSD_VISIBLE)
LIBXSMM_EXTERN int pthread_yield(void) LIBXSMM_THROW;
# else
LIBXSMM_EXTERN void pthread_yield(void);
# endif
# define LIBXSMM_SYNC_YIELD pthread_yield()
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && defined(__APPLE__) && defined(__MACH__)
# define LIBXSMM_LOCK_SPINLOCK mutex
# else
# define LIBXSMM_LOCK_SPINLOCK spin
# endif
# define LIBXSMM_LOCK_MUTEX mutex
# define LIBXSMM_LOCK_RWLOCK rwlock
# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
# define LIBXSMM_LOCK_ACQUIRED_spin 0
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin pthread_spinlock_t
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_spin_init(LOCK, *(ATTR)))
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_destroy(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) pthread_spin_trylock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_lock(LOCK))
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_unlock(LOCK))
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = 0)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
# define LIBXSMM_LOCK_ACQUIRED_mutex 0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex pthread_mutex_t
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_mutex_init(LOCK, ATTR))
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_EXPECT_DEBUG(0, pthread_mutex_destroy(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) pthread_mutex_trylock(LOCK) /*!LIBXSMM_EXPECT*/
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_lock(LOCK))
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_unlock(LOCK))
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex pthread_mutexattr_t
# if !defined(__linux__) || defined(__USE_UNIX98) || defined(__USE_XOPEN2K8)
# if defined(_DEBUG)
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (LIBXSMM_EXPECT(0, pthread_mutexattr_init(ATTR)), \
 LIBXSMM_EXPECT(0, pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_ERRORCHECK)))
# else
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (pthread_mutexattr_init(ATTR), \
 pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_NORMAL))
# endif
# else
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) pthread_mutexattr_init(ATTR)
# endif
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_EXPECT(0, pthread_mutexattr_destroy(ATTR))
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
# define LIBXSMM_LOCK_ACQUIRED_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1
# define LIBXSMM_LOCK_TYPE_rwlock pthread_rwlock_t
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_rwlock_init(LOCK, ATTR))
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_destroy(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) pthread_rwlock_trywrlock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_wrlock(LOCK))
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_unlock(LOCK))
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) pthread_rwlock_tryrdlock(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_rdlock(LOCK))
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock pthread_rwlockattr_t
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_init(ATTR))
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_destroy(ATTR))
# endif
# endif
 /* OpenMP-based locks must remain disabled unless both
  * libxsmm and libxsmmext are built with OpenMP support.
  */
# if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
# define LIBXSMM_LOCK_ACQUIRED_spin 1
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin omp_lock_t
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) omp_destroy_lock(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) omp_test_lock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) omp_set_lock(LOCK)
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) omp_unset_lock(LOCK)
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# if (201811 <= _OPENMP/*v5.0*/)
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR))
# define LIBXSMM_LOCK_ATTR_TYPE_spin omp_lock_hint_t
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = omp_lock_hint_none)
# else
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); }
# define LIBXSMM_LOCK_ATTR_TYPE_spin const void*
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
# define LIBXSMM_LOCK_ACQUIRED_mutex 1
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex omp_lock_t
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) omp_destroy_lock(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) omp_test_lock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) omp_set_lock(LOCK)
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) omp_unset_lock(LOCK)
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# if (201811 <= _OPENMP/*v5.0*/)
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR))
# define LIBXSMM_LOCK_ATTR_TYPE_mutex omp_lock_hint_t
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = omp_lock_hint_none)
# else
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); }
# define LIBXSMM_LOCK_ATTR_TYPE_mutex const void*
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
# define LIBXSMM_LOCK_ACQUIRED_rwlock 1
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0
# define LIBXSMM_LOCK_TYPE_rwlock omp_lock_t
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) omp_destroy_lock(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) omp_test_lock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) omp_set_lock(LOCK)
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) omp_unset_lock(LOCK)
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK)
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK)
# if (201811 <= _OPENMP/*v5.0*/)
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR))
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock omp_lock_hint_t
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) (*(ATTR) = omp_lock_hint_none)
# else
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); }
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock const void*
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# elif !defined(LIBXSMM_SYNC_NONE) /* based on atomic primitives */
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
# define LIBXSMM_LOCK_ACQUIRED_spin 0
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 1
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin volatile LIBXSMM_ATOMIC_LOCKTYPE
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); }
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) (LIBXSMM_LOCK_ACQUIRED_spin + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED))
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
# define LIBXSMM_LOCK_ACQUIRED_mutex 0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 1
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex volatile LIBXSMM_ATOMIC_LOCKTYPE
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); }
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) (LIBXSMM_LOCK_ACQUIRED_mutex + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED))
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex int
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
# define LIBXSMM_LOCK_ACQUIRED_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0
# define LIBXSMM_LOCK_TYPE_rwlock volatile LIBXSMM_ATOMIC_LOCKTYPE
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); }
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) (LIBXSMM_LOCK_ACQUIRED_rwlock + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED))
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK)
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# else /* experimental */
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
# define LIBXSMM_LOCK_ACQUIRED_spin 0
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin libxsmm_spinlock*
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_spinlock_create()); }
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) libxsmm_spinlock_destroy(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) libxsmm_spinlock_trylock(*(LOCK))
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) libxsmm_spinlock_acquire(*(LOCK))
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) libxsmm_spinlock_release(*(LOCK))
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
# define LIBXSMM_LOCK_ACQUIRED_mutex 0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex libxsmm_mutex*
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_mutex_create()); }
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) libxsmm_mutex_destroy(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) libxsmm_mutex_trylock(*(LOCK))
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) libxsmm_mutex_acquire(*(LOCK))
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) libxsmm_mutex_release(*(LOCK))
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex int
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
# define LIBXSMM_LOCK_ACQUIRED_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1
# define LIBXSMM_LOCK_TYPE_rwlock libxsmm_rwlock*
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_rwlock_create()); }
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) libxsmm_rwlock_destroy(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) libxsmm_rwlock_trylock(*(LOCK))
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) libxsmm_rwlock_acquire(*(LOCK))
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) libxsmm_rwlock_release(*(LOCK))
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) libxsmm_rwlock_tryread(*(LOCK))
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) libxsmm_rwlock_acqread(*(LOCK))
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) libxsmm_rwlock_relread(*(LOCK))
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# endif
#else /* no synchronization */
# define LIBXSMM_SYNC_YIELD LIBXSMM_SYNC_PAUSE
# define LIBXSMM_LOCK_SPINLOCK spinlock_dummy
# define LIBXSMM_LOCK_MUTEX mutex_dummy
# define LIBXSMM_LOCK_RWLOCK rwlock_dummy
# define LIBXSMM_LOCK_ACQUIRED(KIND) 0
# define LIBXSMM_LOCK_TYPE_ISPOD(KIND) 1
# define LIBXSMM_LOCK_TYPE_ISRW(KIND) 0
# define LIBXSMM_LOCK_ATTR_TYPE(KIND) int
# define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_TYPE(KIND) int
# define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) { LIBXSMM_UNUSED(LOCK); LIBXSMM_UNUSED(ATTR); }
# define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_LOCK_ACQUIRED(KIND)
# define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_LOCK_TRYLOCK(KIND, LOCK)
# define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_LOCK_ACQUIRE(KIND, LOCK)
# define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_LOCK_RELEASE(KIND, LOCK)
#endif
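/* Minimal sketch of the KIND-generic lock API defined above (illustration
 * only), assuming a build with LIBXSMM_SYNC enabled so that
 * LIBXSMM_LOCK_DEFAULT is defined: the same code compiles against spinlock,
 * mutex, or RW-lock backends. Guarded by "#if 0" so it does not affect
 * compilation. */
#if 0
static LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_DEFAULT) lock;
static void exclusive_work(void) {
  LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK_DEFAULT) attr;
  LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK_DEFAULT, &attr);
  LIBXSMM_LOCK_INIT(LIBXSMM_LOCK_DEFAULT, &lock, &attr);
  LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK_DEFAULT, &attr);
  LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK_DEFAULT, &lock);
  /* ... critical section ... */
  LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &lock);
  LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK_DEFAULT, &lock);
}
#endif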
#if (0 == LIBXSMM_SYNC)
# define LIBXSMM_FLOCK(FILE)
# define LIBXSMM_FUNLOCK(FILE)
#elif defined(_WIN32)
# define LIBXSMM_FLOCK(FILE) _lock_file(FILE)
# define LIBXSMM_FUNLOCK(FILE) _unlock_file(FILE)
#else
# if !defined(__CYGWIN__)
# define LIBXSMM_FLOCK(FILE) flockfile(FILE)
# define LIBXSMM_FUNLOCK(FILE) funlockfile(FILE)
LIBXSMM_EXTERN void flockfile(FILE*) LIBXSMM_THROW;
LIBXSMM_EXTERN void funlockfile(FILE*) LIBXSMM_THROW;
# else /* flockfile/funlockfile are only declared by Cygwin together with C++0x; disabled here. */
# define LIBXSMM_FLOCK(FILE)
# define LIBXSMM_FUNLOCK(FILE)
# endif
#endif
/** Synchronize console output */
#define LIBXSMM_STDIO_ACQUIRE() LIBXSMM_FLOCK(stdout); LIBXSMM_FLOCK(stderr)
#define LIBXSMM_STDIO_RELEASE() LIBXSMM_FUNLOCK(stderr); LIBXSMM_FUNLOCK(stdout)
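/* Usage sketch (illustration only): serialize multi-line console output
 * across threads; assumes <stdio.h>. Guarded by "#if 0" so it does not
 * affect compilation. */
#if 0
static void report(int value) {
  LIBXSMM_STDIO_ACQUIRE(); /* locks stdout and stderr */
  fprintf(stderr, "value=%i\n", value);
  LIBXSMM_STDIO_RELEASE(); /* unlocks in reverse order */
}
#endif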
/** Opaque type which represents a barrier. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_barrier libxsmm_barrier;
/** Create barrier from one of the threads. */
LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core);
/** Initialize the barrier from each thread of the team. */
LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid);
/** Wait for the entire team to arrive. */
LIBXSMM_API void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid);
/** Destroy the resources associated with this barrier. */
LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier);
/** DEPRECATED: use libxsmm_barrier_destroy instead. */
#define libxsmm_barrier_release libxsmm_barrier_destroy
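/* Usage sketch (illustration only): one thread creates the barrier, and every
 * thread of the team initializes it and waits. The OpenMP parallel region is
 * merely one possible way to form a team (assumes <omp.h>); guarded by
 * "#if 0" so it does not affect compilation. */
#if 0
static void two_phase_work(int ncores, int nthreads_per_core) {
  libxsmm_barrier* const barrier = libxsmm_barrier_create(ncores, nthreads_per_core);
# pragma omp parallel num_threads(ncores * nthreads_per_core)
  {
    const int tid = omp_get_thread_num();
    libxsmm_barrier_init(barrier, tid);
    /* ... phase 1 ... */
    libxsmm_barrier_wait(barrier, tid); /* all threads arrive before phase 2 */
    /* ... phase 2 ... */
  }
  libxsmm_barrier_destroy(barrier);
}
#endif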
/** Spin-lock, which may differ from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_SPINLOCK). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spinlock libxsmm_spinlock;
LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void);
LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock);
LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock);
LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock);
LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock);
/** Mutual-exclusion lock (mutex), which may differ from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_MUTEX). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mutex libxsmm_mutex;
LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void);
LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* mutex);
LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex);
LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex);
LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex);
/** Reader-writer lock (RW-lock), which may differ from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_RWLOCK). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_rwlock libxsmm_rwlock;
LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void);
LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock);
LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock);
LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock);
LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock);
LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock);
LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock);
LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock);
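/* Usage sketch (illustration only): the RW-lock admits concurrent readers and
 * exclusive writers. The shared table and its accessors are hypothetical;
 * guarded by "#if 0" so it does not affect compilation. */
#if 0
static libxsmm_rwlock* table_lock; /* assigned via libxsmm_rwlock_create() */
static int read_entry(int index, const int* table) {
  int value;
  libxsmm_rwlock_acqread(table_lock); /* shared access */
  value = table[index];
  libxsmm_rwlock_relread(table_lock);
  return value;
}
static void write_entry(int index, int value, int* table) {
  libxsmm_rwlock_acquire(table_lock); /* exclusive access */
  table[index] = value;
  libxsmm_rwlock_release(table_lock);
}
#endif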
/** Utility function to retrieve the process ID of the calling process. */
LIBXSMM_API unsigned int libxsmm_get_pid(void);
/**
 * Utility function to retrieve a thread ID (TID) for the calling thread.
 * The TID is not tied to a specific threading runtime, and TID=0 is not
 * guaranteed to represent the main thread. TIDs are zero-based, consecutive numbers.
 */
LIBXSMM_API unsigned int libxsmm_get_tid(void);
#endif /*LIBXSMM_SYNC_H*/
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_TIMER_H
#define LIBXSMM_TIMER_H
#include "libxsmm_macros.h"
typedef unsigned long long libxsmm_timer_tickint;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_timer_info {
int tsc;
} libxsmm_timer_info;
/** Query timer properties. */
LIBXSMM_API int libxsmm_get_timer_info(libxsmm_timer_info* info);
/**
* Returns the current clock tick of a monotonic timer source with
* platform-specific resolution (not necessarily CPU cycles).
*/
LIBXSMM_API libxsmm_timer_tickint libxsmm_timer_tick(void);
/** Returns the difference between two timer ticks (cycles); avoids potential side-effects/assumptions of LIBXSMM_DIFF. */
LIBXSMM_API_INLINE libxsmm_timer_tickint libxsmm_timer_ncycles(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1) {
return LIBXSMM_DELTA(tick0, tick1);
}
/** Returns the duration (in seconds) between two values received by libxsmm_timer_tick. */
LIBXSMM_API double libxsmm_timer_duration(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1);
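/* Usage sketch (illustration only): time a section of code in seconds;
 * guarded by "#if 0" so it does not affect compilation. */
#if 0
static double timed_work(void) {
  const libxsmm_timer_tickint start = libxsmm_timer_tick();
  /* ... work to be measured ... */
  return libxsmm_timer_duration(start, libxsmm_timer_tick());
}
#endif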
#endif /*LIBXSMM_TIMER_H*/