Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
c454d419
Commit
c454d419
authored
May 12, 2023
by
lisj
Browse files
删除子模块的gitignore
parent
3359c1f1
Changes
264
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
5304 additions
and
0 deletions
+5304
-0
third_party/libxsmm/include/libxsmm_dnn_fusedgroupnorm.h
third_party/libxsmm/include/libxsmm_dnn_fusedgroupnorm.h
+39
-0
third_party/libxsmm/include/libxsmm_dnn_optimizer.h
third_party/libxsmm/include/libxsmm_dnn_optimizer.h
+55
-0
third_party/libxsmm/include/libxsmm_dnn_pooling.h
third_party/libxsmm/include/libxsmm_dnn_pooling.h
+65
-0
third_party/libxsmm/include/libxsmm_dnn_rnncell.h
third_party/libxsmm/include/libxsmm_dnn_rnncell.h
+79
-0
third_party/libxsmm/include/libxsmm_dnn_softmaxloss.h
third_party/libxsmm/include/libxsmm_dnn_softmaxloss.h
+51
-0
third_party/libxsmm/include/libxsmm_dnn_tensor.h
third_party/libxsmm/include/libxsmm_dnn_tensor.h
+199
-0
third_party/libxsmm/include/libxsmm_frontend.h
third_party/libxsmm/include/libxsmm_frontend.h
+590
-0
third_party/libxsmm/include/libxsmm_fsspmdm.h
third_party/libxsmm/include/libxsmm_fsspmdm.h
+40
-0
third_party/libxsmm/include/libxsmm_generator.h
third_party/libxsmm/include/libxsmm_generator.h
+219
-0
third_party/libxsmm/include/libxsmm_intrinsics_x86.h
third_party/libxsmm/include/libxsmm_intrinsics_x86.h
+1022
-0
third_party/libxsmm/include/libxsmm_macros.h
third_party/libxsmm/include/libxsmm_macros.h
+983
-0
third_party/libxsmm/include/libxsmm_malloc.h
third_party/libxsmm/include/libxsmm_malloc.h
+397
-0
third_party/libxsmm/include/libxsmm_math.h
third_party/libxsmm/include/libxsmm_math.h
+140
-0
third_party/libxsmm/include/libxsmm_memory.h
third_party/libxsmm/include/libxsmm_memory.h
+85
-0
third_party/libxsmm/include/libxsmm_mhd.h
third_party/libxsmm/include/libxsmm_mhd.h
+167
-0
third_party/libxsmm/include/libxsmm_rng.h
third_party/libxsmm/include/libxsmm_rng.h
+57
-0
third_party/libxsmm/include/libxsmm_source.h
third_party/libxsmm/include/libxsmm_source.h
+144
-0
third_party/libxsmm/include/libxsmm_spmdm.h
third_party/libxsmm/include/libxsmm_spmdm.h
+115
-0
third_party/libxsmm/include/libxsmm_sync.h
third_party/libxsmm/include/libxsmm_sync.h
+816
-0
third_party/libxsmm/include/libxsmm_timer.h
third_party/libxsmm/include/libxsmm_timer.h
+41
-0
No files found.
Too many changes to show.
To preserve performance only
264 of 264+
files are displayed.
Plain diff
Email patch
third_party/libxsmm/include/libxsmm_dnn_fusedgroupnorm.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_FUSEDGROUPNORM_H
#define LIBXSMM_DNN_FUSEDGROUPNORM_H

#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"

/** Opaque handle representing a LIBXSMM fused group-normalization operator. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm libxsmm_dnn_fusedgroupnorm;

/** Creates an operator handle from the given descriptor; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_fusedgroupnorm* libxsmm_dnn_create_fusedgroupnorm(
  libxsmm_dnn_fusedgroupnorm_desc fusedgroupnorm_desc, libxsmm_dnn_err_t* status);
/** Destroys a handle previously returned by libxsmm_dnn_create_fusedgroupnorm. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedgroupnorm(
  const libxsmm_dnn_fusedgroupnorm* handle);

/** Creates the data layout descriptor of the tensor of the given type for this operator. */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(
  const libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);

/** Returns the size in bytes of the scratch memory required by the operator. */
LIBXSMM_API size_t libxsmm_dnn_fusedgroupnorm_get_scratch_size(
  const libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_err_t* status);
/** Binds caller-provided scratch memory to the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_scratch(
  libxsmm_dnn_fusedgroupnorm* handle, const void* scratch);
/** Detaches previously bound scratch memory from the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_scratch(
  libxsmm_dnn_fusedgroupnorm* handle);

/** Binds a tensor to the operator for the given tensor type. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_tensor(
  libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor* tensor,
  const libxsmm_dnn_tensor_type type);
/** Retrieves the tensor bound under the given tensor type; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedgroupnorm_get_tensor(
  libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);
/** Unbinds the tensor of the given type from the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_tensor(
  libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type);

/** Executes the operator for the given compute kind; start_thread and tid identify
 *  the calling thread (declared int, logically unsigned per the original annotation). */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_execute_st(
  libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_compute_kind kind,
  /*unsigned*/int start_thread, /*unsigned*/int tid);
/** Reduces statistics over num_handles operator handles for the given compute kind. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st(
  libxsmm_dnn_fusedgroupnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind,
  /*unsigned*/int start_thread, /*unsigned*/int tid);

#endif /*LIBXSMM_DNN_FUSEDGROUPNORM_H*/
third_party/libxsmm/include/libxsmm_dnn_optimizer.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_SGD_H
#define LIBXSMM_DNN_SGD_H

#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"

/** Opaque handle representing a LIBXSMM optimizer. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer libxsmm_dnn_optimizer;

/** Supported optimizer algorithms. */
typedef enum libxsmm_dnn_optimizer_type {
  LIBXSMM_DNN_OPTIMIZER_SGD = 1
} libxsmm_dnn_optimizer_type;

/** Descriptor used to set up an optimizer handle. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer_desc {
  int C;                                    /* number of feature maps */
  int K;                                    /* number of feature maps */
  int bc;
  int bk;
  float learning_rate;                      /* learning rate */
  int threads;                              /* number of threads used */
  libxsmm_dnn_optimizer_type opt_type;
  libxsmm_dnn_datatype datatype_master;     /* datatype used for all input related buffers */
  libxsmm_dnn_datatype datatype;            /* datatype used for all input related buffers */
  libxsmm_dnn_tensor_format filter_format;  /* format used for filter buffers */
} libxsmm_dnn_optimizer_desc;

/** Creates an optimizer handle from the given descriptor; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_optimizer* libxsmm_dnn_create_optimizer(
  libxsmm_dnn_optimizer_desc optimizer_desc, libxsmm_dnn_err_t* status);
/** Destroys a handle previously returned by libxsmm_dnn_create_optimizer. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_optimizer(
  const libxsmm_dnn_optimizer* handle);

/** Creates the data layout descriptor of the tensor of the given type for this optimizer. */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_optimizer_create_tensor_datalayout(
  const libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);

/** Returns the pointer to the currently bound scratch memory. */
LIBXSMM_API void* libxsmm_dnn_optimizer_get_scratch_ptr(
  const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status);
/** Returns the size in bytes of the scratch memory required by the optimizer. */
LIBXSMM_API size_t libxsmm_dnn_optimizer_get_scratch_size(
  const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status);
/** Binds caller-provided scratch memory to the optimizer. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_scratch(
  libxsmm_dnn_optimizer* handle, const void* scratch);
/** Detaches previously bound scratch memory from the optimizer. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_scratch(
  libxsmm_dnn_optimizer* handle);

/** Binds a tensor to the optimizer for the given tensor type. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_tensor(
  libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor* tensor,
  const libxsmm_dnn_tensor_type type);
/** Retrieves the tensor bound under the given tensor type; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_optimizer_get_tensor(
  libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);
/** Unbinds the tensor of the given type from the optimizer. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_tensor(
  libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type);

/** Executes the optimizer; start_thread and tid identify the calling thread
 *  (declared int, logically unsigned per the original annotation). */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_execute_st(
  libxsmm_dnn_optimizer* handle, /*unsigned*/int start_thread, /*unsigned*/int tid);

#endif /*LIBXSMM_DNN_SGD_H*/
third_party/libxsmm/include/libxsmm_dnn_pooling.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_POOLING_H
#define LIBXSMM_DNN_POOLING_H

#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"

/** Opaque handle representing a LIBXSMM pooling operator. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling libxsmm_dnn_pooling;

/** Supported pooling operations. */
typedef enum libxsmm_dnn_pooling_type {
  LIBXSMM_DNN_POOLING_MAX = 1,
  LIBXSMM_DNN_POOLING_AVG = 2
} libxsmm_dnn_pooling_type;

/** Descriptor used to set up a pooling handle. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling_desc {
  int N;                                    /* number of images in mini-batch */
  int C;                                    /* number of input feature maps */
  int H;                                    /* height of input image */
  int W;                                    /* width of input image */
  int R;                                    /* kernel height */
  int S;                                    /* kernel width */
  int u;                                    /* vertical stride */
  int v;                                    /* horizontal stride */
  int pad_h;                                /* height of logical padding of input buffer */
  int pad_w;                                /* width of logical padding of input buffer */
  int pad_h_in;                             /* height of physical zero-padding in input buffer */
  int pad_w_in;                             /* width of physical zero-padding in input buffer */
  int pad_h_out;                            /* height of physical zero-padding in output buffer */
  int pad_w_out;                            /* width of physical zero-padding in output buffer */
  int threads;                              /* number of threads used */
  libxsmm_dnn_datatype datatype_in;         /* datatype used for all input related buffers */
  libxsmm_dnn_datatype datatype_out;        /* datatype used for all output related buffers */
  libxsmm_dnn_datatype datatype_mask;       /* datatype used for the masks */
  libxsmm_dnn_tensor_format buffer_format;  /* format used for activation buffers */
  libxsmm_dnn_pooling_type pooling_type;    /* type of pooling operation */
} libxsmm_dnn_pooling_desc;

/** Creates a pooling handle from the given descriptor; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_pooling* libxsmm_dnn_create_pooling(
  libxsmm_dnn_pooling_desc pooling_desc, libxsmm_dnn_err_t* status);
/** Destroys a handle previously returned by libxsmm_dnn_create_pooling. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_pooling(
  const libxsmm_dnn_pooling* handle);

/** Creates the data layout descriptor of the tensor of the given type for this operator. */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_pooling_create_tensor_datalayout(
  const libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);

/** Returns the size in bytes of the scratch memory required by the operator. */
LIBXSMM_API size_t libxsmm_dnn_pooling_get_scratch_size(
  const libxsmm_dnn_pooling* handle, libxsmm_dnn_err_t* status);
/** Binds caller-provided scratch memory to the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_scratch(
  libxsmm_dnn_pooling* handle, const void* scratch);
/** Detaches previously bound scratch memory from the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_scratch(
  libxsmm_dnn_pooling* handle);

/** Binds a tensor to the operator for the given tensor type. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_tensor(
  libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor* tensor,
  const libxsmm_dnn_tensor_type type);
/** Retrieves the tensor bound under the given tensor type; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_pooling_get_tensor(
  libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);
/** Unbinds the tensor of the given type from the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_tensor(
  libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type);

/** Executes the operator for the given compute kind; start_thread and tid identify
 *  the calling thread (declared int, logically unsigned per the original annotation). */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_execute_st(
  libxsmm_dnn_pooling* handle, libxsmm_dnn_compute_kind kind,
  /*unsigned*/int start_thread, /*unsigned*/int tid);

#endif /*LIBXSMM_DNN_POOLING_H*/
third_party/libxsmm/include/libxsmm_dnn_rnncell.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Kunal Banerjee (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_RNNCELL_H
#define LIBXSMM_DNN_RNNCELL_H

#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"

/** Opaque handle representing a LIBXSMM RNN cell. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell libxsmm_dnn_rnncell;

/** Kind of recurrent cell (comment in the original says "convolutions"; these are RNN cell variants). */
typedef enum libxsmm_dnn_rnncell_type {
  LIBXSMM_DNN_RNNCELL_RNN_RELU,    /* simple RNN cell with ReLU as activation function */
  LIBXSMM_DNN_RNNCELL_RNN_SIGMOID, /* simple RNN cell with sigmoid as activation function */
  LIBXSMM_DNN_RNNCELL_RNN_TANH,    /* simple RNN cell with tanh as activation function */
  LIBXSMM_DNN_RNNCELL_LSTM,        /* LSTM cell */
  LIBXSMM_DNN_RNNCELL_GRU          /* GRU cell */
} libxsmm_dnn_rnncell_type;

/** Descriptor used to set up an RNN-cell handle. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell_desc {
  int threads;
  libxsmm_blasint K;                        /* number of outputs */
  libxsmm_blasint N;                        /* size of the minibatch */
  libxsmm_blasint C;                        /* number of inputs */
  libxsmm_blasint max_T;                    /* number of time steps */
  libxsmm_blasint bk;
  libxsmm_blasint bn;
  libxsmm_blasint bc;
  int use_fwd_fused_impl;
  int fwd_block;
  int bwdupd_block;
  libxsmm_dnn_rnncell_type cell_type;       /* RNN ReLU, RNN Sigmoid, RNN Tanh, LSTM, or GRU */
  libxsmm_dnn_datatype datatype_in;         /* datatype used for all input related buffers */
  libxsmm_dnn_datatype datatype_out;        /* datatype used for all output related buffers */
  libxsmm_dnn_tensor_format buffer_format;  /* format used for activation buffers */
  libxsmm_dnn_tensor_format filter_format;  /* format used for filter buffers */
} libxsmm_dnn_rnncell_desc;

/** Creates an RNN-cell handle from the given descriptor; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_rnncell* libxsmm_dnn_create_rnncell(
  libxsmm_dnn_rnncell_desc rnncell_desc, libxsmm_dnn_err_t* status);
/** Destroys a handle previously returned by libxsmm_dnn_create_rnncell. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_rnncell(
  const libxsmm_dnn_rnncell* handle);

/** Creates the data layout descriptor of the tensor of the given type for this cell. */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_rnncell_create_tensor_datalayout(
  const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);

/** Returns the size in bytes of the scratch memory required for the given compute kind. */
LIBXSMM_API size_t libxsmm_dnn_rnncell_get_scratch_size(
  const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind,
  libxsmm_dnn_err_t* status);
/** Returns the pointer to the currently bound scratch memory. */
LIBXSMM_API void* libxsmm_dnn_rnncell_get_scratch_ptr(
  const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status);
/** Binds caller-provided scratch memory to the cell for the given compute kind. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_scratch(
  libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* scratch);
/** Detaches previously bound scratch memory for the given compute kind. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_scratch(
  libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind);

/** Returns the size in bytes of the internal state required for the given compute kind. */
LIBXSMM_API size_t libxsmm_dnn_rnncell_get_internalstate_size(
  const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind,
  libxsmm_dnn_err_t* status);
/** Returns the pointer to the currently bound internal state. */
LIBXSMM_API void* libxsmm_dnn_rnncell_get_internalstate_ptr(
  const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status);
/** Binds caller-provided internal-state memory to the cell for the given compute kind. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_internalstate(
  libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* internalstate);
/** Detaches previously bound internal-state memory for the given compute kind. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_internalstate(
  libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind);

/** Sets the forget-gate bias value (LSTM-style forget bias) for the cell. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_allocate_forget_bias(
  libxsmm_dnn_rnncell* handle, const float forget_bias);

/** Binds a tensor to the cell for the given tensor type. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_tensor(
  libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor* tensor,
  const libxsmm_dnn_tensor_type type);
/** Retrieves the tensor bound under the given tensor type; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_rnncell_get_tensor(
  libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);
/** Unbinds the tensor of the given type from the cell. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_tensor(
  libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type);

/** Sets the active sequence length T (must not exceed max_T of the descriptor — verify in impl). */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_set_sequence_length(
  libxsmm_dnn_rnncell* handle, const libxsmm_blasint T);
/** Returns the currently configured sequence length; errors are reported via status. */
LIBXSMM_API libxsmm_blasint libxsmm_dnn_rnncell_get_sequence_length(
  libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status);

/** Executes the cell for the given compute kind; start_thread and tid identify
 *  the calling thread (declared int, logically unsigned per the original annotation). */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_execute_st(
  libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind,
  /*unsigned*/int start_thread, /*unsigned*/int tid);

#endif /*LIBXSMM_DNN_RNNCELL_H*/
third_party/libxsmm/include/libxsmm_dnn_softmaxloss.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_SOFTMAXLOSS_H
#define LIBXSMM_DNN_SOFTMAXLOSS_H

#include "libxsmm_dnn.h"
#include "libxsmm_dnn_tensor.h"

/** Opaque handle representing a LIBXSMM softmax-loss operator. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss libxsmm_dnn_softmaxloss;

/** Descriptor used to set up a softmax-loss handle. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss_desc {
  int N;                                    /* number of images in mini-batch */
  int C;                                    /* number of input feature maps */
  int bn;                                   /* requested N blocking for NCNC format */
  int bc;                                   /* requested C blocking for NCNC format */
  float loss_weight;                        /* loss weight */
  int threads;                              /* number of threads used */
  libxsmm_dnn_datatype datatype;            /* datatype used for all buffers */
  libxsmm_dnn_tensor_format buffer_format;  /* format used for activation buffers */
} libxsmm_dnn_softmaxloss_desc;

/** Creates a softmax-loss handle from the given descriptor; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_softmaxloss* libxsmm_dnn_create_softmaxloss(
  libxsmm_dnn_softmaxloss_desc softmaxloss_desc, libxsmm_dnn_err_t* status);
/** Destroys a handle previously returned by libxsmm_dnn_create_softmaxloss. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_softmaxloss(
  const libxsmm_dnn_softmaxloss* handle);

/** Creates the data layout descriptor of the tensor of the given type for this operator. */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_softmaxloss_create_tensor_datalayout(
  const libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);

/** Returns the pointer to the currently bound scratch memory. */
LIBXSMM_API void* libxsmm_dnn_softmaxloss_get_scratch_ptr(
  const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status);
/** Returns the size in bytes of the scratch memory required by the operator. */
LIBXSMM_API size_t libxsmm_dnn_softmaxloss_get_scratch_size(
  const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status);
/** Binds caller-provided scratch memory to the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_scratch(
  libxsmm_dnn_softmaxloss* handle, const void* scratch);
/** Detaches previously bound scratch memory from the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_scratch(
  libxsmm_dnn_softmaxloss* handle);

/** Binds a tensor to the operator for the given tensor type. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_tensor(
  libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor* tensor,
  const libxsmm_dnn_tensor_type type);
/** Retrieves the tensor bound under the given tensor type; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_softmaxloss_get_tensor(
  libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type,
  libxsmm_dnn_err_t* status);
/** Unbinds the tensor of the given type from the operator. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_tensor(
  libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type);

/** Executes the operator for the given compute kind; start_thread and tid identify
 *  the calling thread (declared int, logically unsigned per the original annotation). */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_execute_st(
  libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_compute_kind kind,
  /*unsigned*/int start_thread, /*unsigned*/int tid);

/** Returns the loss value held by the handle; errors are reported via status. */
LIBXSMM_API float libxsmm_dnn_softmaxloss_get_loss(
  const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status);

#endif /*LIBXSMM_DNN_SOFTMAXLOSS_H*/
third_party/libxsmm/include/libxsmm_dnn_tensor.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_DNN_TENSOR_H
#define LIBXSMM_DNN_TENSOR_H

#include "libxsmm_typedefs.h"
#include "libxsmm_dnn.h"

/** Opaque handle representing a LIBXSMM DNN tensor. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor libxsmm_dnn_tensor;

/** Semantic meaning of a tensor dimension. */
typedef enum libxsmm_dnn_tensor_dimtype {
  LIBXSMM_DNN_TENSOR_DIMTYPE_N,  /* mini-batch */
  LIBXSMM_DNN_TENSOR_DIMTYPE_H,  /* image height */
  LIBXSMM_DNN_TENSOR_DIMTYPE_W,  /* image width */
  LIBXSMM_DNN_TENSOR_DIMTYPE_C,  /* channels or input channels */
  LIBXSMM_DNN_TENSOR_DIMTYPE_K,  /* output channels */
  LIBXSMM_DNN_TENSOR_DIMTYPE_R,  /* kernel height */
  LIBXSMM_DNN_TENSOR_DIMTYPE_S,  /* kernel width */
  LIBXSMM_DNN_TENSOR_DIMTYPE_T,  /* sequence length counter */
  LIBXSMM_DNN_TENSOR_DIMTYPE_G,  /* channel group counter */
  LIBXSMM_DNN_TENSOR_DIMTYPE_X   /* general counter */
} libxsmm_dnn_tensor_dimtype;

/** Types of the different buffers a tensor can represent. */
typedef enum libxsmm_dnn_tensor_type {
  LIBXSMM_DNN_REGULAR_INPUT,                  /* regular input buffer */
  LIBXSMM_DNN_REGULAR_INPUT_ADD,              /* regular input buffer */
  LIBXSMM_DNN_REGULAR_INPUT_TRANS,            /* regular input buffer, transposed */
  LIBXSMM_DNN_GRADIENT_INPUT,                 /* gradient input buffer */
  LIBXSMM_DNN_GRADIENT_INPUT_ADD,             /* gradient input buffer */
  LIBXSMM_DNN_REGULAR_OUTPUT,                 /* regular output buffer */
  LIBXSMM_DNN_GRADIENT_OUTPUT,                /* gradient output buffer */
  LIBXSMM_DNN_INPUT,                          /* general input type */
  LIBXSMM_DNN_OUTPUT,                         /* general output type */
  LIBXSMM_DNN_ACTIVATION,                     /* general activation type */
  LIBXSMM_DNN_REGULAR_FILTER,                 /* regular filter */
  LIBXSMM_DNN_REGULAR_FILTER_TRANS,           /* regular filter, transposed */
  LIBXSMM_DNN_GRADIENT_FILTER,                /* gradient filter */
  LIBXSMM_DNN_MASTER_FILTER,                  /* master filter */
  LIBXSMM_DNN_FILTER,                         /* general filter type */
  LIBXSMM_DNN_REGULAR_CHANNEL_BIAS,           /* regular bias */
  LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS,          /* gradient bias */
  LIBXSMM_DNN_CHANNEL_BIAS,                   /* bias */
  LIBXSMM_DNN_REGULAR_CHANNEL_BETA,           /* regular beta */
  LIBXSMM_DNN_GRADIENT_CHANNEL_BETA,          /* gradient beta */
  LIBXSMM_DNN_CHANNEL_BETA,                   /* beta */
  LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA,          /* regular gamma */
  LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA,         /* gradient gamma */
  LIBXSMM_DNN_CHANNEL_GAMMA,                  /* gamma */
  LIBXSMM_DNN_CHANNEL_EXPECTVAL,              /* expected value */
  LIBXSMM_DNN_CHANNEL_RCPSTDDEV,              /* reciprocal standard deviation */
  LIBXSMM_DNN_CHANNEL_VARIANCE,               /* variance */
  LIBXSMM_DNN_CHANNEL_SCALAR,                 /* general bias type */
  LIBXSMM_DNN_LABEL,                          /* labels */
  LIBXSMM_DNN_BATCH_STATS,                    /* batch stats */
  LIBXSMM_DNN_MAX_STATS_FWD,
  LIBXSMM_DNN_MAX_STATS_BWD,
  LIBXSMM_DNN_MAX_STATS_UPD,
  LIBXSMM_DNN_POOLING_MASK,                   /* pooling mask */
  LIBXSMM_DNN_RELU_MASK,                      /* ReLU mask */
  LIBXSMM_DNN_TENSOR,                         /* general type; may cause API issues in copy in/out API */
  LIBXSMM_DNN_RNN_REGULAR_INPUT,              /* regular input buffer */
  LIBXSMM_DNN_RNN_REGULAR_CS_PREV,            /* regular previous cell state buffer */
  LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV,  /* regular previous hidden state buffer */
  LIBXSMM_DNN_RNN_REGULAR_WEIGHT,             /* regular weight (LSTM: wi, wc, wf, wo) */
  LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT,       /* regular recurrent weight (LSTM: ri, rc, rf, ro) */
  LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS,       /* regular weight, transposed (LSTM: wi, wc, wf, wo) */
  LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, /* regular recurrent weight, transposed (LSTM: ri, rc, rf, ro) */
  LIBXSMM_DNN_RNN_REGULAR_BIAS,               /* regular bias (LSTM: bi, bc, bf, bo) */
  LIBXSMM_DNN_RNN_REGULAR_CS,                 /* regular output cell state buffer */
  LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE,       /* regular hidden state buffer */
  LIBXSMM_DNN_RNN_GRADIENT_INPUT,             /* gradient input buffer */
  LIBXSMM_DNN_RNN_GRADIENT_CS_PREV,           /* gradient previous cell state buffer */
  LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, /* gradient previous hidden state buffer */
  LIBXSMM_DNN_RNN_GRADIENT_WEIGHT,            /* gradient weight */
  LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT,      /* gradient recurrent weight */
  LIBXSMM_DNN_RNN_GRADIENT_BIAS,              /* gradient bias */
  LIBXSMM_DNN_RNN_GRADIENT_CS,                /* gradient output cell state buffer */
  LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE,      /* gradient hidden state buffer */
  LIBXSMM_DNN_RNN_INTERNAL_I,                 /* internal i buffer */
  LIBXSMM_DNN_RNN_INTERNAL_F,                 /* internal f buffer */
  LIBXSMM_DNN_RNN_INTERNAL_O,                 /* internal o buffer */
  LIBXSMM_DNN_RNN_INTERNAL_CI,                /* internal ci buffer */
  LIBXSMM_DNN_RNN_INTERNAL_CO                 /* internal co buffer */
} libxsmm_dnn_tensor_type;

/** Layout descriptor allowing external data handling outside of LIBXSMM. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor_datalayout {
  libxsmm_dnn_tensor_dimtype* dim_type;  /* per-dimension semantic type */
  unsigned int* dim_size;                /* per-dimension extent */
  unsigned int num_dims;                 /* number of dimensions */
  libxsmm_dnn_tensor_format format;      /* format of activation buffer */
  libxsmm_dnn_datatype datatype;         /* data type */
  libxsmm_dnn_tensor_type tensor_type;   /* tensor type */
} libxsmm_dnn_tensor_datalayout;

/* Tensor-layout handling. */
/** Duplicates a layout descriptor; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_duplicate_tensor_datalayout(
  const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status);
/** Destroys a layout descriptor. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor_datalayout(
  libxsmm_dnn_tensor_datalayout* layout);
/** Compares two layout descriptors; errors are reported via status. */
LIBXSMM_API unsigned int libxsmm_dnn_compare_tensor_datalayout(
  const libxsmm_dnn_tensor_datalayout* layout_a, const libxsmm_dnn_tensor_datalayout* layout_b,
  libxsmm_dnn_err_t* status);
/** Returns the size in bytes of a tensor with the given layout. */
LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_size(
  const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status);
/** Returns the number of elements of a tensor with the given layout. */
LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_elements(
  const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status);

/* Create and manage buffers, filters and bias (non-NULL if successful). */
/** Wraps externally owned data into a tensor with the given layout. */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_tensor(
  const libxsmm_dnn_tensor_datalayout* layout, const void* data, libxsmm_dnn_err_t* status);
/** Wraps externally owned data into a quantized tensor with the given exponent. */
LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_qtensor(
  const libxsmm_dnn_tensor_datalayout* layout, const void* data, const unsigned char exp,
  libxsmm_dnn_err_t* status);
/** Replaces the data pointer held by the tensor. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_tensor_data_ptr(
  libxsmm_dnn_tensor* tensor, const void* data);
/** Returns the data pointer held by the tensor; errors are reported via status. */
LIBXSMM_API void* libxsmm_dnn_get_tensor_data_ptr(
  const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status);
/** Returns the layout descriptor of the tensor; errors are reported via status. */
LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_get_tensor_datalayout(
  const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status);
/** Returns the scaling factor of a quantized tensor. */
LIBXSMM_API unsigned char libxsmm_dnn_get_qtensor_scf(
  const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status);
/** Sets the scaling factor of a quantized tensor. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_qtensor_scf(
  libxsmm_dnn_tensor* tensor, const unsigned char scf);
/** Destroys a tensor handle (does not free the wrapped data). */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor(
  const libxsmm_dnn_tensor* tensor);
/** Zero-fills the tensor's data. */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_zero_tensor(
  const libxsmm_dnn_tensor* tensor);

/**
 * Copy-in/out from a plain format such as [N][C][H][W] or [N][H][W][C].
 */
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyin_tensor(
  const libxsmm_dnn_tensor* tensor, const void* data, const libxsmm_dnn_tensor_format in_format);
LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyout_tensor(
  const libxsmm_dnn_tensor* tensor, void* data, const libxsmm_dnn_tensor_format out_format);

#endif /*LIBXSMM_DNN_TENSOR_H*/
third_party/libxsmm/include/libxsmm_frontend.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_FRONTEND_H
#define LIBXSMM_FRONTEND_H
#include "libxsmm_typedefs.h"
/** Helper macros for eliding prefetch address calculations depending on prefetch scheme. */
#if !defined(_WIN32) && !defined(__CYGWIN__)
/* TODO: fully support calling convention */
#if 0 != ((LIBXSMM_PREFETCH) & 2/*AL2*/) \
|| 0 != ((LIBXSMM_PREFETCH) & 8/*AL2_AHEAD*/)
# define LIBXSMM_GEMM_PREFETCH_A(EXPR) (EXPR)
#endif
#if 0 != ((LIBXSMM_PREFETCH) & 4/*BL2_VIA_C*/) \
|| 0 != ((LIBXSMM_PREFETCH) & 16/*BL1*/)
# define LIBXSMM_GEMM_PREFETCH_B(EXPR) (EXPR)
#endif
#endif
/** Secondary helper macros derived from the above group. */
#if defined(LIBXSMM_GEMM_PREFETCH_A)
# define LIBXSMM_NOPREFETCH_A(EXPR)
#else
# define LIBXSMM_NOPREFETCH_A(EXPR) EXPR
# define LIBXSMM_GEMM_PREFETCH_A(EXPR) 0
#endif
#if defined(LIBXSMM_GEMM_PREFETCH_B)
# define LIBXSMM_NOPREFETCH_B(EXPR)
#else
# define LIBXSMM_NOPREFETCH_B(EXPR) EXPR
# define LIBXSMM_GEMM_PREFETCH_B(EXPR) 0
#endif
#if defined(LIBXSMM_GEMM_PREFETCH_C)
# define LIBXSMM_NOPREFETCH_C(EXPR)
#else
# define LIBXSMM_NOPREFETCH_C(EXPR) EXPR
# define LIBXSMM_GEMM_PREFETCH_C(EXPR) 0
#endif
/** MKL_DIRECT_CALL requires to include the MKL interface. */
#if (defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) || \
(defined(__MKL) && !defined(LIBXSMM_BUILD) && \
(!defined(__BLAS) || (0 != __BLAS))))
# if (0 != LIBXSMM_ILP64 && !defined(MKL_ILP64))
# error "Inconsistent ILP64 configuration detected!"
# endif
# if defined(LIBXSMM_OFFLOAD_BUILD)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
# include <mkl.h>
# pragma offload_attribute(pop)
# else
# include <mkl.h>
# endif
#endif
/** __INTEL_MKL__ is needed later to fix some NOTHROW issue. */
#if defined(__MKL) && !defined(__INTEL_MKL__) && defined(NOTHROW)
# if defined(LIBXSMM_OFFLOAD_BUILD)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
# include <mkl_version.h>
# pragma offload_attribute(pop)
# else
# include <mkl_version.h>
# endif
#endif
/** Unfortunately calculation of INTEL_MKL_VERSION is not stable over time. */
#if defined(__INTEL_MKL__) && defined(__INTEL_MKL_MINOR__) && defined(__INTEL_MKL_UPDATE__)
# define LIBXSMM_MKL_VERSION3 LIBXSMM_VERSION3(__INTEL_MKL__, __INTEL_MKL_MINOR__, __INTEL_MKL_UPDATE__)
#endif
/** Automatically select a prefetch-strategy (libxsmm_get_gemm_xprefetch, etc.). */
#define LIBXSMM_PREFETCH_AUTO -1
/** Append "_omp" postfix to the given symbol. */
#define LIBXSMM_USEOMP(FUNCTION) LIBXSMM_CONCATENATE(FUNCTION, _omp)
/** Helper macro for BLAS-style prefixes. */
/* LIBXSMM_TPREFIX_<itype><otype> maps an input/output type-pair to the
 * BLAS-style prefix character(s), e.g. double/double -> "d", short/int -> "wi". */
#define LIBXSMM_TPREFIX_NAME(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_, TYPE)
#define LIBXSMM_TPREFIX(TYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_NAME(TYPE), FUNCTION)
#define LIBXSMM_TPREFIX_doubledouble d
#define LIBXSMM_TPREFIX_floatfloat s
#define LIBXSMM_TPREFIX_shortfloat ws
#define LIBXSMM_TPREFIX_shortint wi
#define LIBXSMM_TPREFIX_libxsmm_bfloat16float bs
/** Defaults if only the input type is specified. */
#define LIBXSMM_TPREFIX_double LIBXSMM_TPREFIX_doubledouble
#define LIBXSMM_TPREFIX_float LIBXSMM_TPREFIX_floatfloat
#define LIBXSMM_TPREFIX_short LIBXSMM_TPREFIX_shortint
#define LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE) LIBXSMM_CONCATENATE(LIBXSMM_GEMM_XFLAGS_, ITYPE)
/* ignore OTYPE for now */
/* extra GEMM flags per input type; bf16 input requires the VNNI-A layout flag */
#define LIBXSMM_GEMM_XFLAGS_double 0
#define LIBXSMM_GEMM_XFLAGS_float 0
#define LIBXSMM_GEMM_XFLAGS_libxsmm_bfloat16 LIBXSMM_GEMM_FLAG_VNNI_A
#define LIBXSMM_GEMM_XFLAGS_int 0
#define LIBXSMM_GEMM_XFLAGS_short 0
/** Construct symbol name from a given real type name (float, double and short). */
#define LIBXSMM_BLAS_FNTYPE(TYPE, KIND) LIBXSMM_CONCATENATE3(libxsmm_, LIBXSMM_TPREFIX(TYPE, KIND), _function)
#define LIBXSMM_MMFUNCTION_TYPE(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmfunction))
#define LIBXSMM_MMDISPATCH_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmdispatch))
#define LIBXSMM_XBLAS_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_blas_, LIBXSMM_TPREFIX(TYPE, gemm))
#define LIBXSMM_XGEMM_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, gemm))
#define LIBXSMM_YGEMM_SYMBOL(TYPE) LIBXSMM_USEOMP(LIBXSMM_XGEMM_SYMBOL(TYPE))
#define LIBXSMM_BLAS_SYMBOL(TYPE, KIND) LIBXSMM_FSYMBOL(LIBXSMM_TPREFIX(TYPE, KIND))
#define LIBXSMM_CBLAS_SYMBOL LIBXSMM_TPREFIX
#define LIBXSMM_BLAS_DECL(TYPE, KIND, DECL) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_, LIBXSMM_TPREFIX(TYPE, KIND))(DECL)
/* With MKL_DIRECT_CALL, MKL already provides (macro) definitions of the BLAS
 * symbols; in that case LIBXSMM's own declarations must be suppressed (the
 * wrappers below then swallow DECL instead of emitting "DECL;"). */
#if !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL)
# define LIBXSMM_BLAS_dgemm(DECL) DECL;
# define LIBXSMM_BLAS_sgemm(DECL) DECL;
# define LIBXSMM_BLAS_dgemv(DECL) DECL;
# define LIBXSMM_BLAS_sgemv(DECL) DECL;
#else
# define LIBXSMM_BLAS_dgemm
# define LIBXSMM_BLAS_sgemm
# define LIBXSMM_BLAS_dgemv
# define LIBXSMM_BLAS_sgemv
#endif
/* Construct prefix names, function type or dispatch function from given input and output types. */
#define LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) LIBXSMM_MMFUNCTION_TYPE(LIBXSMM_CONCATENATE(ITYPE, OTYPE))
#define LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE) LIBXSMM_MMDISPATCH_SYMBOL(LIBXSMM_CONCATENATE(ITYPE, OTYPE))
#define LIBXSMM_TPREFIX_NAME2(ITYPE, OTYPE) LIBXSMM_TPREFIX_NAME(LIBXSMM_CONCATENATE(ITYPE, OTYPE))
#define LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION) LIBXSMM_TPREFIX(LIBXSMM_CONCATENATE(ITYPE, OTYPE), FUNCTION)
/** Helper macro for comparing selected types. */
#define LIBXSMM_EQUAL(T1, T2) LIBXSMM_CONCATENATE3(LIBXSMM_EQUAL_, T1, T2)
#define LIBXSMM_EQUAL_floatfloat 1
#define LIBXSMM_EQUAL_doubledouble 1
#define LIBXSMM_EQUAL_floatdouble 0
#define LIBXSMM_EQUAL_doublefloat 0
#define LIBXSMM_EQUAL_shortdouble 0
#define LIBXSMM_EQUAL_shortfloat 0
/* LIBXSMM_BLAS_CONST decorates BLAS pointer arguments; OpenBLAS-flavored
 * builds get a non-const (empty) decoration, everything else gets "const". */
#if defined(LIBXSMM_BLAS_CONST)
# undef LIBXSMM_BLAS_CONST
# define LIBXSMM_BLAS_CONST const
#elif defined(OPENBLAS_CONST)
# define LIBXSMM_BLAS_CONST OPENBLAS_CONST
#elif defined(LIBXSMM_BLAS_NONCONST) || defined(__OPENBLAS) || defined(__OPENBLAS77)
# define LIBXSMM_BLAS_CONST
#else
# define LIBXSMM_BLAS_CONST const
#endif
/* LIBXSMM_BLAS / LIBXSMM_NO_BLAS are complementary 0/1 switches derived from __BLAS. */
#if !defined(LIBXSMM_NO_BLAS)
# if (!defined(__BLAS) || (0 != __BLAS))
# define LIBXSMM_NO_BLAS 0
# define LIBXSMM_BLAS 1
# else
# define LIBXSMM_NO_BLAS 1
# define LIBXSMM_BLAS 0
# endif
#endif
#if defined(__BLAS) && (1 == __BLAS)
# if defined(__OPENBLAS)
/* OpenBLAS exposes a thread-count setter; LIBXSMM_BLAS_INIT pins the linked
 * BLAS to a single thread before it is invoked. */
LIBXSMM_EXTERN void openblas_set_num_threads(int num_threads);
# define LIBXSMM_BLAS_INIT openblas_set_num_threads(1);
# endif
#endif
#if !defined(LIBXSMM_BLAS_INIT)
# define LIBXSMM_BLAS_INIT
#endif
#if defined(LIBXSMM_BUILD)
# if defined(LIBXSMM_BUILD_EXT) && !defined(__STATIC)
# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_APIEXT
# elif defined(LIBXSMM_NO_BLAS) && (1 == LIBXSMM_NO_BLAS)
# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_API
# endif
#endif
#if !defined(LIBXSMM_BLAS_SYMBOL_VISIBILITY)
# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_EXTERN LIBXSMM_VISIBILITY_IMPORT LIBXSMM_RETARGETABLE
#endif
#if defined(NOTHROW)
# define LIBXSMM_BLAS_NOTHROW NOTHROW
#else
# define LIBXSMM_BLAS_NOTHROW LIBXSMM_NOEXCEPT
#endif
/* per-KIND exception specification; gemm_batch only gets NOTHROW for MKL >= 2020.0.2 */
#define LIBXSMM_BLAS_NOEXCEPT(KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_NOEXCEPT_, KIND)
#if defined(LIBXSMM_MKL_VERSION3) && (LIBXSMM_VERSION3(2020, 0, 2) <= LIBXSMM_MKL_VERSION3)
# define LIBXSMM_BLAS_NOEXCEPT_gemm_batch LIBXSMM_BLAS_NOTHROW
#else
# define LIBXSMM_BLAS_NOEXCEPT_gemm_batch
#endif
#define LIBXSMM_BLAS_NOEXCEPT_gemm LIBXSMM_BLAS_NOTHROW
#define LIBXSMM_BLAS_NOEXCEPT_gemv LIBXSMM_BLAS_NOTHROW
/* Argument lists of the Fortran-style BLAS symbols; CONST_STAR and STAR
 * parameterize the pointer/constness decoration at each expansion site. */
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm_batch(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \
libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE STAR STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \
libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemv(CONST_STAR, STAR, TYPE) char CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \
TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR
#define LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_SYMBOL_SIGNATURE_, KIND)(CONST_STAR, STAR, TYPE)
#define LIBXSMM_BLAS_SYMBOL_FDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \
void LIBXSMM_BLAS_SYMBOL(TYPE, KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND)
#define LIBXSMM_BLAS_SYMBOL_CDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \
void LIBXSMM_CBLAS_SYMBOL(TYPE, KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND)
#if (0 != LIBXSMM_BLAS)
/* BLAS available */
# define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND) LIBXSMM_BLAS_DECL(TYPE, KIND, LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, TYPE, KIND))
#else
# define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND)
#endif
/** Helper macro consolidating the transpose requests into a set of flags.
 * NOTE: tests for N/n rather than T/t since C/c (conjugate) is also valid input.
 * FIX: the comment between the "#define" line and its body had severed the
 * backslash continuation, leaving the macro empty and its body as stray tokens. */
#define LIBXSMM_GEMM_FLAGS(TRANSA, TRANSB) \
  ((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \
  | (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B))
/** Helper macro consolidating CBLAS transpose requests into a set of flags. */
#define LIBXSMM_GEMM_CFLAGS(TRANSA, TRANSB) \
  ((CblasNoTrans == (TRANSA) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \
  | (CblasNoTrans == (TRANSB) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B))
/** Helper macro consolidating transpose and VNNI-layout requests into a set of flags. */
#define LIBXSMM_GEMM_VNNI_FLAGS(TRANSA, TRANSB, VNNIA, VNNIB) \
  ((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \
  | (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B) \
  | (('n' == (VNNIA) || *"N" == (VNNIA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_A) \
  | (('n' == (VNNIB) || *"N" == (VNNIB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_B))
/** Helper macro allowing NULL-requests (transposes) supplied by some default;
 *  non-transpose bits of DEFAULT are carried through unchanged. */
#define LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, DEFAULT) LIBXSMM_GEMM_FLAGS( \
  NULL != ((const void*)(TRANSA)) ? (*(const char*)(TRANSA)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (DEFAULT)) ? 'n' : 't'), \
  NULL != ((const void*)(TRANSB)) ? (*(const char*)(TRANSB)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (DEFAULT)) ? 'n' : 't')) \
  | (~(LIBXSMM_GEMM_FLAG_TRANS_A | LIBXSMM_GEMM_FLAG_TRANS_B) & (DEFAULT))
/** Inlinable GEMM exercising the compiler's code generation (macro template).
 * TODO: only NN is supported and SP/DP matrices.
 * FIX: comment lines pulled out of the macro body (after the opening brace,
 * after the M-deref, and after the induction variables) had severed the
 * backslash continuations; the comments are rejoined inline. */
#define LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \
  /* use 'n' (instead of 'N') to avoid warning about "no macro replacement within a character constant" */ \
  const char libxsmm_inline_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \
    (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \
  const char libxsmm_inline_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \
    (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \
  const libxsmm_blasint libxsmm_inline_xgemm_m_ = *(const libxsmm_blasint*)(M); /* must be specified */ \
  const libxsmm_blasint libxsmm_inline_xgemm_k_ = (NULL != ((void*)(K)) ? (*(const libxsmm_blasint*)(K)) : libxsmm_inline_xgemm_m_); \
  const libxsmm_blasint libxsmm_inline_xgemm_n_ = (NULL != ((void*)(N)) ? (*(const libxsmm_blasint*)(N)) : libxsmm_inline_xgemm_k_); \
  const libxsmm_blasint libxsmm_inline_xgemm_lda_ = (NULL != ((void*)(LDA)) ? (*(const libxsmm_blasint*)(LDA)) : \
    (('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_) ? libxsmm_inline_xgemm_m_ : libxsmm_inline_xgemm_k_)); \
  const libxsmm_blasint libxsmm_inline_xgemm_ldb_ = (NULL != ((void*)(LDB)) ? (*(const libxsmm_blasint*)(LDB)) : \
    (('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_) ? libxsmm_inline_xgemm_k_ : libxsmm_inline_xgemm_n_)); \
  const libxsmm_blasint libxsmm_inline_xgemm_ldc_ = (NULL != ((void*)(LDC)) ? (*(const libxsmm_blasint*)(LDC)) : libxsmm_inline_xgemm_m_); \
  const OTYPE libxsmm_inline_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
  const OTYPE libxsmm_inline_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
  libxsmm_blasint libxsmm_inline_xgemm_ni_, libxsmm_inline_xgemm_mi_ = 0, libxsmm_inline_xgemm_ki_; /* loop induction variables */ \
  LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_); \
  LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_); \
  LIBXSMM_PRAGMA_SIMD \
  for (libxsmm_inline_xgemm_mi_ = 0; libxsmm_inline_xgemm_mi_ < libxsmm_inline_xgemm_m_; ++libxsmm_inline_xgemm_mi_) { \
    LIBXSMM_PRAGMA_LOOP_COUNT(1, LIBXSMM_CONFIG_MAX_DIM, LIBXSMM_CONFIG_AVG_DIM) \
    for (libxsmm_inline_xgemm_ki_ = 0; libxsmm_inline_xgemm_ki_ < libxsmm_inline_xgemm_k_; ++libxsmm_inline_xgemm_ki_) { \
      LIBXSMM_PRAGMA_UNROLL \
      for (libxsmm_inline_xgemm_ni_ = 0; libxsmm_inline_xgemm_ni_ < libxsmm_inline_xgemm_n_; ++libxsmm_inline_xgemm_ni_) { \
        ((OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] \
          = ((const ITYPE*)(B))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldb_+libxsmm_inline_xgemm_ki_] * \
            (((const ITYPE*)(A))[libxsmm_inline_xgemm_ki_*libxsmm_inline_xgemm_lda_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_alpha_) \
          + ((const OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_beta_; \
      } \
    } \
  } \
}
/* LIBXSMM_INIT either asserts that initialization already happened (CTOR builds)
 * or lazily calls libxsmm_init() based on the libxsmm_ninit counter. */
#if (defined(LIBXSMM_INIT) || defined(LIBXSMM_CTOR))
# undef LIBXSMM_INIT
# define LIBXSMM_INIT LIBXSMM_ASSERT_MSG(1 < libxsmm_ninit, "LIBXSMM is not initialized");
# define LIBXSMM_INIT_COMPLETED
#else
# define LIBXSMM_INIT if (2 > libxsmm_ninit) libxsmm_init();
#endif
/** Map to appropriate BLAS function (or fallback). The mapping is used, e.g., inside of LIBXSMM_BLAS_XGEMM. */
#define LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_FUNCTION_, LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION))
#if (0 != LIBXSMM_BLAS)
/* Helper macro to eventually (if defined) call libxsmm_init */
/* When initialization is completed the function-pointer variables are read
 * directly; otherwise the libxsmm_original_*() accessors are invoked. */
# if defined(LIBXSMM_INIT_COMPLETED)
# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch_function
# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch_function
# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm_function
# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm_function
# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv_function
# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv_function
# else
# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch()
# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch()
# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm()
# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm()
# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv()
# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv()
# endif
#else
/* no BLAS */
# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_blas_error("dgemm_batch")
# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_blas_error("sgemm_batch")
# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_blas_error("dgemm")
# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_blas_error("sgemm")
# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_blas_error("dgemv")
# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_blas_error("sgemv")
#endif
/** Low-precision (BLAS-like) function symbols. */
#define LIBXSMM_BLAS_FUNCTION_wigemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_INLINE_XGEMM(short, int, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#define LIBXSMM_BLAS_FUNCTION_bsgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_INLINE_XGEMM(libxsmm_bfloat16, float, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
/** Short-cut macros to construct desired BLAS function symbol. */
#define LIBXSMM_BLAS_FUNCTION1(TYPE, FUNCTION) LIBXSMM_BLAS_FUNCTION(TYPE, TYPE, FUNCTION)
#define LIBXSMM_GEMM_BATCH_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm_batch)
#define LIBXSMM_GEMM_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm)
#define LIBXSMM_GEMV_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemv)
/** BLAS-based GEMM supplied by the linked LAPACK/BLAS library (macro template).
 * FIX: the standalone comment line after the opening brace had severed the
 * backslash continuation, truncating the macro; the comment is rejoined inline. */
#define LIBXSMM_BLAS_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \
  /* use 'n' (instead of 'N') to avoid warning about "no macro replacement within a character constant" */ \
  const char libxsmm_blas_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \
    (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \
  const char libxsmm_blas_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \
    (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \
  const libxsmm_blasint *const libxsmm_blas_xgemm_k_ = (NULL != ((void*)(K)) ? (K) : (M)); \
  const libxsmm_blasint *const libxsmm_blas_xgemm_n_ = (NULL != ((void*)(N)) ? (N) : libxsmm_blas_xgemm_k_); \
  const libxsmm_blasint libxsmm_blas_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \
    *(('n' == libxsmm_blas_xgemm_transa_ || *"N" == libxsmm_blas_xgemm_transa_) ? (M) : libxsmm_blas_xgemm_k_), 1); \
  const libxsmm_blasint libxsmm_blas_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? *(LDB) : \
    *(('n' == libxsmm_blas_xgemm_transb_ || *"N" == libxsmm_blas_xgemm_transb_) ? libxsmm_blas_xgemm_k_ : libxsmm_blas_xgemm_n_), 1); \
  const libxsmm_blasint libxsmm_blas_xgemm_ldc_ = LIBXSMM_MAX(NULL != ((void*)(LDC)) ? *(LDC) : *(M), 1); \
  const OTYPE libxsmm_blas_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
  const OTYPE libxsmm_blas_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
  LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(&libxsmm_blas_xgemm_transa_, &libxsmm_blas_xgemm_transb_, \
    M, libxsmm_blas_xgemm_n_, libxsmm_blas_xgemm_k_, \
    &libxsmm_blas_xgemm_alpha_, (const ITYPE*)(A), &libxsmm_blas_xgemm_lda_, \
    (const ITYPE*)(B), &libxsmm_blas_xgemm_ldb_, \
    &libxsmm_blas_xgemm_beta_, (ITYPE*)(C), &libxsmm_blas_xgemm_ldc_); \
}
/** Helper macros for calling a dispatched function in a row/column-major aware fashion.
 * FIX: the "#if (0 == LIBXSMM_PREFETCH)" directive had been split across three
 * physical lines without continuations, which is not a valid directive. */
#define LIBXSMM_MMCALL_ABC(FN, A, B, C) \
  LIBXSMM_ASSERT(FN); FN(A, B, C)
#define LIBXSMM_MMCALL_PRF(FN, A, B, C, PA, PB, PC) { \
  LIBXSMM_NOPREFETCH_A(LIBXSMM_UNUSED(PA)); \
  LIBXSMM_NOPREFETCH_B(LIBXSMM_UNUSED(PB)); \
  LIBXSMM_NOPREFETCH_C(LIBXSMM_UNUSED(PC)); \
  LIBXSMM_ASSERT(FN); FN(A, B, C, \
    LIBXSMM_GEMM_PREFETCH_A(PA), \
    LIBXSMM_GEMM_PREFETCH_B(PB), \
    LIBXSMM_GEMM_PREFETCH_C(PC)); \
}
#if (0/*LIBXSMM_GEMM_PREFETCH_NONE*/ == LIBXSMM_PREFETCH)
# define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \
  LIBXSMM_MMCALL_ABC(FN, A, B, C)
#else
/* next-call prefetch locations: one panel past A, B, and C respectively */
# define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \
  LIBXSMM_MMCALL_PRF(FN, A, B, C, (A) + ((size_t)LDA) * (K), (B) + ((size_t)LDB) * (N), (C) + ((size_t)LDC) * (N))
#endif
#define LIBXSMM_MMCALL(FN, A, B, C, M, N, K) LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, M, K, M)
/** Calculate problem size from M, N, and K using the correct integer type in order to cover the general case. */
#define LIBXSMM_MNK_SIZE(M, N, K) (((size_t)(M)) * ((size_t)(N)) * ((size_t)(K)))
/** Calculate total number of matrix-elements; matrices A, B, C are given per M, N, K, and emphasize (S) the C-size. */
#define LIBXSMM_SIZE(M, N, K, S) \
  (((size_t)(M) * (size_t)(K)) + ((size_t)(K) * (size_t)(N)) + \
  (((size_t)(S) * (size_t)(M) * (size_t)(N))))
/** Condition based on arithmetic intensity (AI): 2*M*N*K FLOPs vs. 4x the element count.
 * FIX: the "/\*AI*\/" comment had been pulled onto its own line, severing the
 * backslash continuation and truncating the macro; it is rejoined inline. */
#define LIBXSMM_SMM_AI(M, N, K, S, TYPESIZE) \
  ((LIBXSMM_MNK_SIZE(M, N, K) * 2) <= ((size_t)(TYPESIZE) * 4/*AI*/ * LIBXSMM_SIZE(M, N, K, S)))
/** Determine whether an SMM is suitable, i.e., small enough. */
#if !defined(LIBXSMM_THRESHOLD_AI) /* traditional MNK-threshold */
# define LIBXSMM_SMM(M, N, K, S, TYPESIZE) (LIBXSMM_MNK_SIZE(M, N, K) <= (LIBXSMM_MAX_MNK))
#else /* threshold based on arithmetic intensity */
# define LIBXSMM_SMM LIBXSMM_SMM_AI
#endif
/** Fall-back code paths: LIBXSMM_XGEMM_FALLBACK0, and LIBXSMM_XGEMM_FALLBACK1 (macro template). */
/* Both default to the mapped BLAS gemm; FALLBACK0 serves the below-threshold
 * path (dispatch miss), FALLBACK1 the above-threshold path (see LIBXSMM_XGEMM). */
#if !defined(LIBXSMM_XGEMM_FALLBACK0)
# define LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#if !defined(LIBXSMM_XGEMM_FALLBACK1)
# define LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
/**
 * Execute a specialized function, or use a fallback code path depending on threshold (macro template).
 * LIBXSMM_XGEMM_FALLBACK0 or specialized function: below LIBXSMM_MAX_MNK
 * LIBXSMM_XGEMM_FALLBACK1: above LIBXSMM_MAX_MNK
 * FIX: the "/\*RFO*\/" comment inside the LIBXSMM_SMM argument list had been
 * pulled onto its own line, severing the backslash continuation; rejoined inline.
 */
#define LIBXSMM_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \
  const int libxsmm_xgemm_flags_ = LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, LIBXSMM_FLAGS) | LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE); \
  const libxsmm_blasint *const libxsmm_xgemm_k_ = (NULL != (K) ? (K) : (M)); \
  const libxsmm_blasint *const libxsmm_xgemm_n_ = (NULL != (N) ? (N) : libxsmm_xgemm_k_); \
  const libxsmm_blasint libxsmm_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \
    *(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? (M) : libxsmm_xgemm_k_), 1); \
  const libxsmm_blasint libxsmm_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? *(LDB) : \
    *(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? libxsmm_xgemm_k_ : libxsmm_xgemm_n_), 1); \
  const libxsmm_blasint libxsmm_xgemm_ldc_ = LIBXSMM_MAX(NULL != (LDC) ? *(LDC) : *(M), 1); \
  if (LIBXSMM_SMM(*(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, 2/*RFO*/, sizeof(OTYPE))) { \
    const LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) libxsmm_mmfunction_ = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE)( \
      *(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, &libxsmm_xgemm_lda_, &libxsmm_xgemm_ldb_, &libxsmm_xgemm_ldc_, \
      (const OTYPE*)(ALPHA), (const OTYPE*)(BETA), &libxsmm_xgemm_flags_, NULL); \
    if (NULL != libxsmm_mmfunction_) { \
      LIBXSMM_MMCALL_LDX(libxsmm_mmfunction_, (const ITYPE*)(A), (const ITYPE*)(B), (OTYPE*)(C), \
        *(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, libxsmm_xgemm_lda_, libxsmm_xgemm_ldb_, libxsmm_xgemm_ldc_); \
    } \
    else { \
      const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \
      const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \
      const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
      const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
      LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \
        M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \
        &libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \
        B, &libxsmm_xgemm_ldb_, \
        &libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \
    } \
  } \
  else { \
    const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \
    const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \
    const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \
    const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \
    LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \
      M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \
      &libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \
      B, &libxsmm_xgemm_ldb_, \
      &libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \
  } \
}
/** Helper macro to setup a matrix with some initial values.
 * Non-zero SEED: deterministic scaled ramp (LD-padding set to SEED);
 * zero SEED: shuffle-based initialization normalized to [-1, +1].
 * FIX: four standalone comment lines had severed the backslash continuations
 * (after the seed, around "else {", and inside the normalization expression);
 * the comments are rejoined inline. */
#define LIBXSMM_MATINIT_AUX(OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) { \
  /*const*/ double libxsmm_matinit_seed_ = (double)(SEED); /* avoid constant conditional */ \
  const double libxsmm_matinit_scale_ = (SCALE) * libxsmm_matinit_seed_ + (SCALE); \
  const libxsmm_blasint libxsmm_matinit_nrows_ = (libxsmm_blasint)NROWS; \
  const libxsmm_blasint libxsmm_matinit_ld_ = (libxsmm_blasint)LD; \
  libxsmm_blasint libxsmm_matinit_i_ = 0, libxsmm_matinit_j_ = 0; \
  LIBXSMM_OMP_VAR(libxsmm_matinit_i_); LIBXSMM_OMP_VAR(libxsmm_matinit_j_); \
  if (0 != libxsmm_matinit_seed_) { \
    OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \
    for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \
      for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < libxsmm_matinit_nrows_; ++libxsmm_matinit_j_) { \
        const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \
        (DST)[libxsmm_matinit_k_] = (TYPE)(libxsmm_matinit_scale_ * (1.0 + \
          libxsmm_matinit_i_ * libxsmm_matinit_nrows_ + libxsmm_matinit_j_)); \
      } \
      for (; libxsmm_matinit_j_ < libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \
        const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \
        (DST)[libxsmm_matinit_k_] = (TYPE)(SEED); \
      } \
    } \
  } \
  else { /* shuffle based initialization */ \
    const unsigned int libxsmm_matinit_maxval_ = ((unsigned int)NCOLS) * ((unsigned int)libxsmm_matinit_ld_); \
    const TYPE libxsmm_matinit_maxval2_ = (TYPE)(libxsmm_matinit_maxval_ / 2), libxsmm_matinit_inv_ = (TYPE)((SCALE) / libxsmm_matinit_maxval2_); \
    const size_t libxsmm_matinit_shuffle_ = libxsmm_shuffle(libxsmm_matinit_maxval_); \
    OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \
    for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \
      for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \
        const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \
        (DST)[libxsmm_matinit_k_] = libxsmm_matinit_inv_ * /* normalize values to an interval of [-1, +1] */ \
          ((TYPE)(libxsmm_matinit_shuffle_ * libxsmm_matinit_k_ % libxsmm_matinit_maxval_) - libxsmm_matinit_maxval2_); \
      } \
    } \
  } \
}
#define LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \
  LIBXSMM_MATINIT_AUX(LIBXSMM_ELIDE, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE)
#define LIBXSMM_MATINIT_SEQ(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \
  LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE)
#define LIBXSMM_MATINIT_OMP(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \
  LIBXSMM_MATINIT_AUX(LIBXSMM_PRAGMA_OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE)
/** Call libxsmm_gemm_print using LIBXSMM's GEMM-flags.
 * FIX: the standalone comment line inside LIBXSMM_GEMM_PRINT2 had severed the
 * backslash continuation, truncating the macro; rejoined inline. */
#define LIBXSMM_GEMM_PRINT(OSTREAM, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \
  LIBXSMM_GEMM_PRINT2(OSTREAM, PRECISION, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC)
#define LIBXSMM_GEMM_PRINT2(OSTREAM, IPREC, OPREC, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \
  libxsmm_gemm_dprint2(OSTREAM, (libxsmm_gemm_precision)(IPREC), (libxsmm_gemm_precision)(OPREC), \
    /* use 'n' (instead of 'N') to avoid warning about "no macro replacement within a character constant" */ \
    (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (FLAGS)) ? 'n' : 't'), \
    (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (FLAGS)) ? 'n' : 't'), \
    M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC)
/**
 * Utility function, which either prints information about the GEMM call
 * or dumps (FILE/ostream=0) all input and output data into MHD files.
 * The Meta Image Format (MHD) is suitable for visual inspection using,
 * e.g., ITK-SNAP or ParaView.
 */
LIBXSMM_API void libxsmm_gemm_print(void* ostream,
  libxsmm_gemm_precision precision, const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc);
/** Variant of libxsmm_gemm_print with separate input/output precisions (iprec/oprec). */
LIBXSMM_API void libxsmm_gemm_print2(void* ostream,
  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc);
/** Variant of libxsmm_gemm_print taking by-value arguments and double-typed alpha/beta. */
LIBXSMM_API void libxsmm_gemm_dprint(void* ostream,
  libxsmm_gemm_precision precision, char transa, char transb,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  double dalpha, const void* a, libxsmm_blasint lda,
  const void* b, libxsmm_blasint ldb,
  double dbeta, void* c, libxsmm_blasint ldc);
/** Variant of libxsmm_gemm_dprint with separate input/output precisions (iprec/oprec). */
LIBXSMM_API void libxsmm_gemm_dprint2(void* ostream,
  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  char transa, char transb,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  double dalpha, const void* a, libxsmm_blasint lda,
  const void* b, libxsmm_blasint ldb,
  double dbeta, void* c, libxsmm_blasint ldc);
/** Prints information for the given JIT-kernel together with its operand pointers. */
LIBXSMM_API void libxsmm_gemm_xprint(void* ostream,
  libxsmm_xmmfunction kernel, const void* a, const void* b, void* c);
/** GEMM_BATCH: fallback prototype functions served by any compliant LAPACK/BLAS. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_batch_function)(
  LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch));
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_batch_function)(
  LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch));
/** GEMM: fallback prototype functions served by any compliant LAPACK/BLAS. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_function)(
  LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm));
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_function)(
  LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm));
/** GEMV: fallback prototype functions served by any compliant LAPACK/BLAS. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemv_function)(
  LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv));
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemv_function)(
  LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv));
/** Helper function-pointer type to consume arguments when called. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sink_function)(LIBXSMM_VARIADIC);
/** The original BLAS functions. */
LIBXSMM_APIVAR_PUBLIC(/*volatile*/ libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/ libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/ libxsmm_dgemm_function libxsmm_original_dgemm_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/ libxsmm_sgemm_function libxsmm_original_sgemm_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/ libxsmm_dgemv_function libxsmm_original_dgemv_function);
LIBXSMM_APIVAR_PUBLIC(/*volatile*/ libxsmm_sgemv_function libxsmm_original_sgemv_function);
/** Accessors yielding the original BLAS function pointers (see the variables above). */
LIBXSMM_API libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void);
LIBXSMM_API libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void);
LIBXSMM_API libxsmm_dgemm_function libxsmm_original_dgemm(void);
LIBXSMM_API libxsmm_sgemm_function libxsmm_original_sgemm(void);
LIBXSMM_API libxsmm_dgemv_function libxsmm_original_dgemv(void);
LIBXSMM_API libxsmm_sgemv_function libxsmm_original_sgemv(void);
/** Reports an error for the given BLAS symbol and yields a sink-function
 *  (used when no BLAS is linked; see the LIBXSMM_BLAS_FUNCTION_* mapping). */
LIBXSMM_API libxsmm_sink_function libxsmm_blas_error(const char* symbol);
/** Helper function to consume arguments when called. */
LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC);
/**
 * General dense matrix multiplication, which re-exposes LAPACK/BLAS
 * but allows to rely on LIBXSMM's defaults (libxsmm_config.h)
 * when supplying NULL-arguments in certain places.
 */
LIBXSMM_API void libxsmm_blas_xgemm(
  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc);
/** Double-precision short-cut of libxsmm_blas_xgemm. */
#define libxsmm_blas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
  libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \
    TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
/** Single-precision short-cut of libxsmm_blas_xgemm. */
#define libxsmm_blas_sgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
  libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \
    TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
/** Double-precision short-cut of libxsmm_xgemm_omp. */
#define libxsmm_dgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
  libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \
    TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
/** Single-precision short-cut of libxsmm_xgemm_omp. */
#define libxsmm_sgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \
  libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \
    TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
/** Translates GEMM prefetch request into prefetch-enumeration (incl. FE's auto-prefetch). */
LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_xprefetch(const int* prefetch);
LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_prefetch(int prefetch);
/** Determines the given value in double-precision based on the given type. */
LIBXSMM_API int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue);
#endif
/*LIBXSMM_FRONTEND_H*/
third_party/libxsmm/include/libxsmm_fsspmdm.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_FSSPMDM_H
#define LIBXSMM_FSSPMDM_H

#include "libxsmm_typedefs.h"

/** Opaque handle types for fixed-sparsity sparse-matrix times dense-matrix (FsSpMDM). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dfsspmdm libxsmm_dfsspmdm;
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_sfsspmdm libxsmm_sfsspmdm;

/** Creates a double-precision FsSpMDM handle from the dense A-matrix (a_dense). */
LIBXSMM_API libxsmm_dfsspmdm* libxsmm_dfsspmdm_create(
  libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  const double alpha, const double beta, libxsmm_blasint c_is_nt,
  const double* a_dense);
/** Executes C = alpha * A * B + beta * C using the prepared double-precision handle. */
LIBXSMM_API void libxsmm_dfsspmdm_execute(
  const libxsmm_dfsspmdm* handle, const double* B, double* C);
/** Releases all resources of a double-precision FsSpMDM handle. */
LIBXSMM_API void libxsmm_dfsspmdm_destroy(libxsmm_dfsspmdm* handle);

/** Creates a single-precision FsSpMDM handle from the dense A-matrix (a_dense). */
LIBXSMM_API libxsmm_sfsspmdm* libxsmm_sfsspmdm_create(
  libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  const float alpha, const float beta, libxsmm_blasint c_is_nt,
  const float* a_dense);
/** Executes C = alpha * A * B + beta * C using the prepared single-precision handle. */
LIBXSMM_API void libxsmm_sfsspmdm_execute(
  const libxsmm_sfsspmdm* handle, const float* B, float* C);
/** Releases all resources of a single-precision FsSpMDM handle. */
LIBXSMM_API void libxsmm_sfsspmdm_destroy(libxsmm_sfsspmdm* handle);

#endif /*LIBXSMM_FSSPMDM_H*/
third_party/libxsmm/include/libxsmm_generator.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_GENERATOR_H
#define LIBXSMM_GENERATOR_H
#include "libxsmm_typedefs.h"
/**
 * Determines whether the JIT generator can handle the given GEMM request, i.e.,
 * A is not transposed, alpha is 1, and beta is either 1 or 0; otherwise the
 * request must be bypassed (e.g., to a BLAS fallback).
 * Note: the line-continuations below are required; the embedded comment keeps
 * the (currently disabled) alpha = -1 case visible.
 */
#define LIBXSMM_GEMM_NO_BYPASS(FLAGS, ALPHA, BETA) ( \
  0 == ((FLAGS) & (LIBXSMM_GEMM_FLAG_TRANS_A)) && \
  (LIBXSMM_FEQ(1, ALPHA) /*|| LIBXSMM_FEQ(-1, ALPHA)*/) && \
  (LIBXSMM_FEQ(1, BETA) || LIBXSMM_FEQ(0, BETA)))
/** Initialize GEMM descriptor as used by low-level routines (type-specific).
 *  All variants populate the caller-provided blob and return a pointer into it. */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_dgemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  double alpha, double beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_sgemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  float alpha, float beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_wigemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  int alpha, int beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bigemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  int alpha, int beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bbgemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  int alpha, int beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bsgemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  float alpha, float beta, int flags, int prefetch);
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bgemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  float alpha, float beta, int flags, int prefetch);
/** Initialize GEMM descriptor (generic: double-precision alpha/beta, single precision enum). */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit(libxsmm_descriptor_blob* blob,
  libxsmm_gemm_precision precision,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  double alpha, double beta, int flags, int prefetch);
/** Same as libxsmm_gemm_descriptor_dinit, but with separate input/output precisions. */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit2(libxsmm_descriptor_blob* blob,
  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  double alpha, double beta, int flags, int prefetch);
/** Initialize GEMM descriptor as used by low-level routines (generic: alpha/beta passed by type-erased pointer). */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_gemm_precision precision,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  const void* alpha, const void* beta, int flags, int prefetch);
/** Same as libxsmm_gemm_descriptor_init, but with separate input/output precisions. */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init2(libxsmm_descriptor_blob* blob,
  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  const void* alpha, const void* beta, int flags, int prefetch);
/** Similar to libxsmm_gemm_descriptor_init2 with optional type-converted alpha/beta (dalpha/dbeta). */
LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init3(libxsmm_descriptor_blob* blob,
  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
  const void* alpha, const void* beta, int flags, int prefetch,
  double* dalpha, double* dbeta);
/** Initialize matrix-elementwise (transpose etc.) descriptor as used by low-level routines. */
LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_datatype in_type, libxsmm_datatype out_type,
  libxsmm_blasint m, libxsmm_blasint n,
  libxsmm_blasint ldo, libxsmm_blasint ldi,
  unsigned short flags, unsigned char param, unsigned char operation);
/** Extended variant with secondary in/out types and additional input leading dimensions. */
LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init2(libxsmm_descriptor_blob* blob,
  libxsmm_datatype in_type, libxsmm_datatype in2_type,
  libxsmm_datatype out_type, libxsmm_datatype out2_type,
  libxsmm_blasint m, libxsmm_blasint n,
  libxsmm_blasint ldo, libxsmm_blasint ldi,
  libxsmm_blasint ldi2, libxsmm_blasint ldi3,
  unsigned short flags, unsigned char param, unsigned char operation);
/** Initialize matrix-equation descriptor as used by low-level routines. */
LIBXSMM_API libxsmm_meqn_descriptor* libxsmm_meqn_descriptor_init(libxsmm_descriptor_blob* blob,
  libxsmm_datatype type, libxsmm_blasint m, libxsmm_blasint n,
  libxsmm_blasint ldo, unsigned int eqn_idx);
/** Structure referring to the generated code with some attached information. */
LIBXSMM_EXTERN_C typedef struct libxsmm_generated_code {
  /** Pointer to memory which can contain strings or binary code. */
  void* generated_code;
  /** Total size of the buffer generated_code. */
  unsigned int buffer_size;
  /** Size in bytes used within generated_code. */
  unsigned int code_size;
  /**
   * 0: generated code contains inline assembly in a C function
   *    which can be dumped into a *.c/cc/cpp file
   * 1: generated code contains assembly which can be
   *    dumped into an *.s file
   * >1: generated code contains a function in binary code which can be
   *     called, when the code is copied into executable memory
   */
  unsigned int code_type;
  /**
   * 0: no error occurred
   * >0: error code
   */
  unsigned int last_error;
  /* Target arch for the current code generation task. */
  unsigned int arch;
  /* Offset of RSP to the beginning of the stack frame;
   * tracked to keep RBP available for general compute. */
  unsigned int sf_size;
} libxsmm_generated_code;
/** Function to translate LIBXSMM generator error codes to error messages. */
LIBXSMM_API const char* libxsmm_strerror(unsigned int i_error_code);

/* @TODO change int based architecture value */
LIBXSMM_API void libxsmm_generator_gemm_inlineasm(const char* i_file_out, const char* i_routine_name,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch);
/* @TODO change int based architecture value */
LIBXSMM_API void libxsmm_generator_gemm_directasm(const char* i_file_out, const char* i_routine_name,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch);
LIBXSMM_API void libxsmm_generator_gemm_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc);
/* @TODO change int based architecture value */
LIBXSMM_API void libxsmm_generator_spgemm(const char* i_file_out, const char* i_routine_name,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch,
  const char* i_file_in, const int i_is_csr);
/* @TODO change int based architecture value */
LIBXSMM_API void libxsmm_generator_spgemm_csc_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch,
  const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values);
/* @TODO change int based architecture value */
LIBXSMM_API void libxsmm_generator_spgemm_csr_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch,
  const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values);
/* @TODO change int based architecture value */
LIBXSMM_API void libxsmm_generator_spgemm_csr_reg_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const char* i_arch,
  const unsigned int* i_row_idx, const unsigned int* i_column_idx, const double* i_values);
LIBXSMM_API void libxsmm_generator_packed_spgemm_csr_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc,
  const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values,
  const unsigned int i_packed_width);
LIBXSMM_API void libxsmm_generator_packed_spgemm_csc_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc,
  const unsigned int* i_row_idx, const unsigned int* i_column_idx, const void* i_values,
  const unsigned int i_packed_width);
LIBXSMM_API void libxsmm_generator_packed_gemm_ac_rm(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width);
LIBXSMM_API void libxsmm_generator_packed_gemm_bc_rm(libxsmm_generated_code* io_generated_code,
  const libxsmm_gemm_descriptor* i_xgemm_desc, const unsigned int i_packed_width);
LIBXSMM_API void libxsmm_generator_mateltwise_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_meltw_descriptor* i_mateltw_desc);
LIBXSMM_API void libxsmm_generator_matequation_kernel(libxsmm_generated_code* io_generated_code,
  const libxsmm_meqn_descriptor* i_mateqn_desc);
/** Initialization counter that can be used to check whether the library is initialized (!=0) or not (==0). */
LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_ninit);
/** Target architecture (libxsmm_get_target_archid, libxsmm_set_target_archid). */
LIBXSMM_APIVAR_PUBLIC(int libxsmm_target_archid);
/** Verbosity level (0: quiet, 1: errors, 2: warnings, 3: info, neg.: all/dump). */
LIBXSMM_APIVAR_PUBLIC(int libxsmm_verbosity);
/** Security-enhanced environment. */
LIBXSMM_APIVAR_PUBLIC(int libxsmm_se);
#endif
/*LIBXSMM_GENERATOR_H*/
third_party/libxsmm/include/libxsmm_intrinsics_x86.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_INTRINSICS_X86_H
#define LIBXSMM_INTRINSICS_X86_H
#include "libxsmm_cpuid.h"
/** Macro evaluates to LIBXSMM_ATTRIBUTE_TARGET_xxx (see below). */
#define LIBXSMM_ATTRIBUTE_TARGET(TARGET) LIBXSMM_CONCATENATE(LIBXSMM_ATTRIBUTE_TARGET_, TARGET)
#if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_PLATFORM_X86)
# define LIBXSMM_INTRINSICS_NONE
#endif
#if
/*no intrinsics: tested with 17.x and 18.x*/
(defined(__PGI) && \
LIBXSMM_VERSION2(19, 0) > LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) \
||
/*legacy*/
(defined(_CRAYC) && !defined(__GNUC__))
# if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC)
# define LIBXSMM_INTRINSICS_NONE
# endif
#elif !defined(LIBXSMM_INTRINSICS_STATIC) && !defined(LIBXSMM_INTRINSICS_NONE) && ( \
(defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && \
LIBXSMM_VERSION2(4, 4) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))
/* GCC 4.4 (target-attribute) */
\
|| (defined(__clang__) && LIBXSMM_VERSION2(3, 7) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \
|| (defined(__APPLE__) && defined(__MACH__) && !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && \
LIBXSMM_VERSION2(9, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)))
# define LIBXSMM_INTRINSICS_STATIC
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
/** https://github.com/intel/Immintrin-debug */
#if !defined(LIBXSMM_INTRINSICS_DEBUG) && 0
# define LIBXSMM_INTRINSICS_DEBUG
/* workarounds removed after LIBXSMM 1.16.1-1.16.1-1268 */
# include "immintrin_dbg.h"
#endif
#if defined(__MIC__) && !defined(LIBXSMM_INTRINSICS_NONE)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC
# endif
# define LIBXSMM_INTRINSICS(TARGET)
# define LIBXSMM_INTRINSICS_INCLUDE
#elif !defined(LIBXSMM_INTRINSICS_NONE)
/*!defined(__MIC__)*/
# if defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC)
/* TODO: check GCC, Clang, etc. */
\
|| (LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(99, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX512PF__) && defined(__AVX512ER__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX512F__) && defined(__AVX512CD__) \
&& defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \
&& (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \
|| (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \
&& (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \
&& (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE42
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(__SSE3__)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE3
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(LIBXSMM_PLATFORM_X86)
# if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_GENERIC
# endif
# if defined(__GNUC__)
# define LIBXSMM_INTRINSICS_INCLUDE
# endif
# endif
# if defined(LIBXSMM_STATIC_TARGET_ARCH) && !defined(LIBXSMM_INTRINSICS_STATIC)
# if defined(__INTEL_COMPILER)
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
/* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */
# if 1904 <= (LIBXSMM_INTEL_COMPILER) && !defined(_WIN32)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif 1801 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# elif 1500 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# elif 1400 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC
# else
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# endif
# define LIBXSMM_INTRINSICS(TARGET)
/*no need for target flags*/
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(_CRAYC) && defined(__GNUC__)
/* TODO: version check, e.g., LIBXSMM_VERSION2(11, 5) <= LIBXSMM_VERSION2(_RELEASE, _RELEASE_MINOR) */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX
# endif
# define LIBXSMM_INTRINSICS(TARGET)
/*no need for target flags*/
# define LIBXSMM_INTRINSICS_INCLUDE
# elif defined(_MSC_VER) && !defined(__clang__)
/* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# define LIBXSMM_INTRINSICS(TARGET)
/*no need for target flags*/
# define LIBXSMM_INTRINSICS_INCLUDE
# elif (!defined(__GNUC__) || LIBXSMM_VERSION2(4, 9) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
&& (!defined(__clang__) || LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \
&& (!defined(__APPLE__) || !defined(__MACH__)) && !defined(__PGI) && !defined(_MSC_VER)
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG)
/* Cygwin: invalid register for .seh_savexmm */
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# elif (defined(__clang__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif (defined(__GNUC__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif (defined(__GNUC__) && LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# elif (defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# else
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# endif
# endif
# define LIBXSMM_INTRINSICS_INCLUDE
# else
/* GCC/legacy incl. Clang */
# if defined(__clang__) && !(defined(__APPLE__) && defined(__MACH__)) && !defined(_WIN32)
# if (LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
/* TODO */
/* no limitations */
# elif (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < LIBXSMM_X86_AVX2
/*workaround*/
)
# define LIBXSMM_INTRINSICS_STATIC
# endif
# elif !defined(LIBXSMM_INTRINSICS_STATIC)
# define LIBXSMM_INTRINSICS_STATIC
# endif
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG)
/* Cygwin: invalid register for .seh_savexmm */
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2
# elif LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX
# elif LIBXSMM_VERSION2( 6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX
# else
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE
# endif
# endif
# else
/* fallback */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH
# endif
# if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < LIBXSMM_X86_AVX2
/*workaround*/
)
# define LIBXSMM_INTRINSICS_STATIC
# endif
# endif
# if !defined(LIBXSMM_INTRINSICS_INCLUDE) && (!defined(__PGI) || LIBXSMM_VERSION2(19, 0) <= LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__))
# define LIBXSMM_INTRINSICS_INCLUDE
# endif
# endif
/* GCC/legacy incl. Clang */
# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# error "LIBXSMM_MAX_STATIC_TARGET_ARCH not defined!"
# endif
# if defined(LIBXSMM_TARGET_ARCH) && (LIBXSMM_TARGET_ARCH < LIBXSMM_MAX_STATIC_TARGET_ARCH)
# undef LIBXSMM_MAX_STATIC_TARGET_ARCH
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH
# endif
# if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_DEBUG)
# include <immintrin.h>
# endif
/*defined(LIBXSMM_INTRINSICS_INCLUDE)*/
# if !defined(LIBXSMM_INTRINSICS)
# if (LIBXSMM_MAX_STATIC_TARGET_ARCH > LIBXSMM_STATIC_TARGET_ARCH)
# define LIBXSMM_INTRINSICS(TARGET) LIBXSMM_ATTRIBUTE(LIBXSMM_ATTRIBUTE_TARGET(TARGET))
/* LIBXSMM_ATTRIBUTE_TARGET_xxx is required to literally match the CPUID (libxsmm_cpuid.h)! */
# define LIBXSMM_ATTRIBUTE_TARGET_1002 target("sse2")
/* LIBXSMM_X86_GENERIC (64-bit ABI) */
# if (LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1003 target("sse3")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1003 LIBXSMM_ATTRIBUTE_TARGET_1002
# endif
# if (LIBXSMM_X86_SSE42 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1004 target("sse4.1,sse4.2")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1004 LIBXSMM_ATTRIBUTE_TARGET_1003
# endif
# if (LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1005 target("avx")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1005 LIBXSMM_ATTRIBUTE_TARGET_1004
# endif
# if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1006 target("avx2,fma")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1006 LIBXSMM_ATTRIBUTE_TARGET_1005
# endif
# if (LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1007 target("avx2,fma,avx512f,avx512cd")
# else
# define LIBXSMM_ATTRIBUTE_TARGET_1007 LIBXSMM_ATTRIBUTE_TARGET_1006
# endif
# if (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1010 target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er")
# else
/* LIBXSMM_X86_AVX512 */
# define LIBXSMM_ATTRIBUTE_TARGET_1010 LIBXSMM_ATTRIBUTE_TARGET_1007
# endif
# if (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1011 target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er,avx5124vnniw,avx5124fmaps")
# else
/* LIBXSMM_X86_AVX512_MIC */
# define LIBXSMM_ATTRIBUTE_TARGET_1011 LIBXSMM_ATTRIBUTE_TARGET_1010
# endif
# if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1020 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl")
# else
/* LIBXSMM_X86_AVX512 */
# define LIBXSMM_ATTRIBUTE_TARGET_1020 LIBXSMM_ATTRIBUTE_TARGET_1007
# endif
# if (LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1021 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni")
# else
/* LIBXSMM_X86_AVX512_CORE */
# define LIBXSMM_ATTRIBUTE_TARGET_1021 LIBXSMM_ATTRIBUTE_TARGET_1020
# endif
# if (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_ATTRIBUTE_TARGET_1022 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni,avx512bf16")
# else
/* LIBXSMM_X86_AVX512_CORE */
# define LIBXSMM_ATTRIBUTE_TARGET_1022 LIBXSMM_ATTRIBUTE_TARGET_1021
# endif
# else
# define LIBXSMM_INTRINSICS(TARGET)
/*no need for target flags*/
# endif
# elif !defined(LIBXSMM_INTRINSICS_TARGET)
# define LIBXSMM_INTRINSICS_TARGET
# endif
/*!defined(LIBXSMM_INTRINSICS)*/
# endif
/*defined(LIBXSMM_STATIC_TARGET_ARCH)*/
#endif
/*!defined(LIBXSMM_INTRINSICS_NONE)*/
#if !defined(LIBXSMM_STATIC_TARGET_ARCH)
# if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC)
# define LIBXSMM_INTRINSICS_NONE
# endif
# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC
#endif
#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH
#elif (LIBXSMM_MAX_STATIC_TARGET_ARCH < LIBXSMM_STATIC_TARGET_ARCH)
# undef LIBXSMM_MAX_STATIC_TARGET_ARCH
# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH
#endif
#if !defined(LIBXSMM_INTRINSICS)
# define LIBXSMM_INTRINSICS(TARGET)
#endif
/** Include basic x86 intrinsics such as __rdtsc. */
#if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_DEBUG)
# if defined(_WIN32)
# include <intrin.h>
# elif defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) || defined(__clang__) || defined(__PGI)
# include <x86intrin.h>
# elif defined(__GNUC__) && (LIBXSMM_VERSION2(4, 4) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))
# include <x86intrin.h>
# endif
# include <xmmintrin.h>
# if defined(__SSE3__)
# include <pmmintrin.h>
# endif
#endif
#if !defined(LIBXSMM_INTRINSICS_NONE)
# if defined(_WIN32)
# include <malloc.h>
# else
# include <mm_malloc.h>
# endif
#endif
/**
* Intrinsic-specific fix-ups
*/
# define LIBXSMM_INTRINSICS_LOADU_SI128(A) _mm_loadu_si128(A)
#if !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && ( \
(LIBXSMM_VERSION2(3, 9) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \
|| (LIBXSMM_VERSION2(7, 3) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && \
defined(__APPLE__) && defined(__MACH__)))
/* prototypes with incorrect signature: _mm512_load_ps takes DP*, _mm512_load_pd takes SP* (checked with v3.8.1) */
# define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const double*)(A))
# define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const float*)(A))
/* Clang misses _mm512_stream_p? (checked with v3.8.1). */
# define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_store_si512(A, B)
# define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_storeu_ps(A, B)
# define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_store_pd(A, B)
#else
# define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const float*)(A))
# define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const double*)(A))
# define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_stream_si512((__m512i*)(A), (B))
# define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_stream_ps(A, B)
# define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_stream_pd(A, B)
#endif
#if !defined(LIBXSMM_INTEL_COMPILER) || (defined(__clang__) && ( \
(LIBXSMM_VERSION2(8, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)))) \
|| (defined(__APPLE__) && defined(__MACH__)) || defined(__GNUC__)
# define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_si256((__m256i*)(A), B)
#else
# define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_epi32(A, B)
#endif
#if defined(LIBXSMM_INTEL_COMPILER)
# if 1600 <= (LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \
_mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0)
# else
# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \
_mm512_castps_si512(_mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0))
# endif
#else
# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \
E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \
_mm512_set_epi32(((E31) << 16) | (E30), ((E29) << 16) | (E28), ((E27) << 16) | (E26), ((E25) << 16) | (E24), \
((E23) << 16) | (E22), ((E21) << 16) | (E20), ((E19) << 16) | (E18), ((E17) << 16) | (E16), \
((E15) << 16) | (E14), ((E13) << 16) | (E12), ((E11) << 16) | (E10), ((E9) << 16) | (E8), \
((E7) << 16) | (E6), ((E5) << 16) | (E4), ((E3) << 16) | (E2), ((E1) << 16) | (E0))
#endif
#if defined(LIBXSMM_INTEL_COMPILER) \
|| (defined(__GNUC__) && LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| (defined(__clang__) && (!defined(__APPLE__) || !defined(__MACH__)) \
&& LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
# define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_mask_i32gather_epi32(A, B, C, D, E)
# define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm512_extracti64x4_epi64(A, B)
# define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_abs_ps(A)
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_undefined_epi32()
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_undefined()
# define LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256() _mm256_undefined_si256()
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_SI128() _mm_undefined_si128()
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_undefined_pd()
#else
# define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_castps_si512(_mm512_mask_i32gather_ps( \
_mm512_castsi512_ps(A), B, C, (const float*)(D), E))
# define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(A), B))
# define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_castsi512_ps(_mm512_and_epi32( \
_mm512_castps_si512(A), _mm512_set1_epi32(0x7FFFFFFF)))
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_set1_epi32(0)
# define LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_set1_ps(0)
# define LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256() _mm256_set1_epi32(0)
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_SI128() _mm_set1_epi32(0)
# define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_set1_pd(0)
#endif
#if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= (LIBXSMM_INTEL_COMPILER))) \
|| (!defined(LIBXSMM_INTEL_COMPILER) && defined(__GNUC__) \
&& LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \
|| ((!defined(__APPLE__) || !defined(__MACH__)) && defined(__clang__) \
&& LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))
/* Store/load an opmask register to/from memory via the native mask move
 * intrinsics. Fix: the embedded comment had been split onto its own physical
 * line, which severed the macro body (missing line continuation). */
# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \
    LIBXSMM_CONCATENATE(_store_mask, NBITS)((LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR), SRC)
# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \
    LIBXSMM_CONCATENATE(_load_mask, NBITS)((/*const*/LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(_cvtu32_mask, NBITS)((unsigned int)(A))
#elif defined(LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \
(*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC))
# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \
((LIBXSMM_CONCATENATE(__mmask, NBITS))_mm512_mask2int(*(const __mmask16*)(SRC_PTR)))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_, NBITS)(A)
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_32(A) ((__mmask32)(A))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_16(A) _mm512_int2mask((int)(A))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_8(A) ((__mmask8)(A))
#else
# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \
(*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC))
# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) (*(const LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR))
# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) ((LIBXSMM_CONCATENATE(__mmask, NBITS))(A))
#endif
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK64(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 64)
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK32(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 32)
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK16(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 16)
#define LIBXSMM_INTRINSICS_MM512_STORE_MASK8(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 8)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK64(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 64)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK32(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 32)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK16(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 16)
#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK8(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 8)
#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK32(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 32)
#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 16)
#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK8(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 8)
/**
* Pseudo intrinsics for portability
*/
/** Software fallback for bit-scan-forward: index of the lowest set bit of n
 *  (count of trailing zeros); returns 0 when n is zero. */
LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD32_SW(unsigned int n)
{
  unsigned int probe = 1, count = 0;
  if (n != 0) {
    while ((n & probe) == 0) {
      probe <<= 1;
      ++count;
    }
  }
  return count;
}
/** Software fallback for 64-bit bit-scan-forward: index of the lowest set bit
 *  of n (count of trailing zeros); returns 0 when n is zero.
 *  Fix: the scan mask must be 64-bit. With the previous 32-bit mask, any n
 *  whose lowest set bit is at position 32 or higher made the loop spin forever
 *  (the mask overflowed to zero, and n & 0 stays zero). */
LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD64_SW(unsigned long long n)
{
  unsigned long long i; /* 64-bit probe so every bit of n is reachable */
  unsigned int r = 0;
  if (0 != n) for (i = 1; 0 == (n & i); i <<= 1) { ++r; }
  return r;
}
/** Binary Logarithm (based on Stackoverflow's NBITSx macro). */
/* Fix: the embedded binary-literal comments had been split onto their own
 * physical lines, breaking the macro bodies; rejoined onto single lines. */
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N) (0 != ((N) & 0x2/*0b10*/) ? 1 : 0)
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N) (0 != ((N) & 0xC/*0b1100*/) ? (2 | LIBXSMM_INTRINSICS_BITSCANBWD_SW02((N) >> 2)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N) (0 != ((N) & 0xF0/*0b11110000*/) ? (4 | LIBXSMM_INTRINSICS_BITSCANBWD_SW04((N) >> 4)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N) (0 != ((N) & 0xFF00) ? (8 | LIBXSMM_INTRINSICS_BITSCANBWD_SW08((N) >> 8)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N) (0 != ((N) & 0xFFFF0000) ? (16 | LIBXSMM_INTRINSICS_BITSCANBWD_SW16((N) >> 16)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD_SW64(N) (0 != ((N) & 0xFFFFFFFF00000000) ? (32 | LIBXSMM_INTRINSICS_BITSCANBWD_SW32((N) >> 32)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD32_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW32((unsigned int)(N))
#define LIBXSMM_INTRINSICS_BITSCANBWD64_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW64((unsigned long long)(N))
#if defined(_WIN32) && !defined(LIBXSMM_INTRINSICS_NONE)
/** Bit-scan-forward via the MSVC intrinsic; the (0 != n) multiplier forces the
 *  result to 0 when n is zero (the intrinsic leaves the index undefined then). */
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD32(unsigned int n)
{
  unsigned long index = 0;
  _BitScanForward(&index, n);
  return (n != 0) * index;
}
/** Bit-scan-reverse (index of highest set bit) via the MSVC intrinsic; the
 *  index stays at its initial 0 when n is zero. */
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD32(unsigned int n)
{
  unsigned long index = 0;
  _BitScanReverse(&index, n);
  return index;
}
# if defined(_WIN64)
/** 64-bit bit-scan-forward via the MSVC intrinsic; result forced to 0 for n == 0. */
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD64(unsigned long long n)
{
  unsigned long index = 0;
  _BitScanForward64(&index, n);
  return (n != 0) * index;
}
/** 64-bit bit-scan-reverse (index of highest set bit) via the MSVC intrinsic;
 *  the index stays at its initial 0 when n is zero. */
LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD64(unsigned long long n)
{
  unsigned long index = 0;
  _BitScanReverse64(&index, n);
  return index;
}
# else
# define LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW
# define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW
# endif
#elif defined(__GNUC__) && !defined(LIBXSMM_INTRINSICS_NONE)
# define LIBXSMM_INTRINSICS_BITSCANFWD32(N) ((0 != (N)) * __builtin_ctz(N))
# define LIBXSMM_INTRINSICS_BITSCANFWD64(N) ((0 != (N)) * __builtin_ctzll(N))
# define LIBXSMM_INTRINSICS_BITSCANBWD32(N) ((0 != (N)) * (31 - __builtin_clz(N)))
# define LIBXSMM_INTRINSICS_BITSCANBWD64(N) ((0 != (N)) * (63 - __builtin_clzll(N)))
#else
/* fallback implementation */
# define LIBXSMM_INTRINSICS_BITSCANFWD32 LIBXSMM_INTRINSICS_BITSCANFWD32_SW
# define LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW
# define LIBXSMM_INTRINSICS_BITSCANBWD32 LIBXSMM_INTRINSICS_BITSCANBWD32_SW
# define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW
#endif
/** LIBXSMM_NBITS determines the minimum number of bits needed to represent N. */
#define LIBXSMM_NBITS(N) (LIBXSMM_INTRINSICS_BITSCANBWD64(N) + LIBXSMM_MIN(1, N))
/* Power-of-two upper estimate of sqrt(N). Fix: the commented-out correction
 * term had been split onto its own physical line, breaking the macro body. */
#define LIBXSMM_ISQRT2(N) ((unsigned int)((1ULL << (LIBXSMM_NBITS(N) >> 1))/*+ LIBXSMM_MIN(1, N)*/))
/** LIBXSMM_ILOG2 definition matches ceil(log2(N)). */
/* Returns 0 for n <= 1; otherwise floor(log2(n)) plus 1 when n is not an
 * exact power of two (i.e. ceil(log2(n))). */
LIBXSMM_API_INLINE unsigned int LIBXSMM_ILOG2(unsigned long long n)
{
  unsigned int result = 0;
  if (1 < n) {
    const unsigned int m = LIBXSMM_INTRINSICS_BITSCANBWD64(n);
    /* n is a power of two iff the top-bit position of n-1 differs from that
     * of n; the comparison adds the extra 1 only for non-powers of two */
    result = m + ((unsigned int)LIBXSMM_INTRINSICS_BITSCANBWD64(n - 1) == m);
  }
  return result;
}
/**
* Target attribution
*/
#if !defined(LIBXSMM_INTRINSICS_KNC) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(__MIC__)
# define LIBXSMM_INTRINSICS_KNC
#endif
/** LIBXSMM_INTRINSICS_X86 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_X86) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_GENERIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_X86
#endif
/** LIBXSMM_INTRINSICS_SSE3 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_SSE3) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_SSE3
#endif
/** LIBXSMM_INTRINSICS_SSE42 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_SSE42) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE42 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_SSE42
#endif
/** LIBXSMM_INTRINSICS_AVX is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX
#endif
/** LIBXSMM_INTRINSICS_AVX2 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX2
#endif
/** LIBXSMM_INTRINSICS_AVX512 is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512
#endif
/** LIBXSMM_INTRINSICS_AVX512_MIC is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_MIC) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_MIC
#endif
/** LIBXSMM_INTRINSICS_AVX512_KNM is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_KNM) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_KNM
#endif
/** LIBXSMM_INTRINSICS_AVX512_CORE is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_CORE) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_CORE
#endif
/** LIBXSMM_INTRINSICS_AVX512_CLX is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_CLX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CLX <= LIBXSMM_STATIC_TARGET_ARCH || \
(!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH))
# define LIBXSMM_INTRINSICS_AVX512_CLX
#endif
/** LIBXSMM_INTRINSICS_AVX512_CPX is defined only if the compiler is able to generate this code without special flags. */
#if !defined(LIBXSMM_INTRINSICS_AVX512_CPX) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(LIBXSMM_X86_AVX512_CPX) && \
!defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)
# define LIBXSMM_INTRINSICS_AVX512_CPX
#endif
/** 2048-bit state for xoshiro128+ RNG (state/symbols needed even if AVX-512 is not used) */
#define LIBXSMM_INTRINSICS_MM512_RNG_STATE(INDEX) (*(__m512i*)LIBXSMM_CONCATENATE(libxsmm_intrinsics_mm512_rng_state, INDEX))
LIBXSMM_APIVAR_PUBLIC
(
unsigned
int
libxsmm_intrinsics_mm512_rng_state0
[
16
]);
LIBXSMM_APIVAR_PUBLIC
(
unsigned
int
libxsmm_intrinsics_mm512_rng_state1
[
16
]);
LIBXSMM_APIVAR_PUBLIC
(
unsigned
int
libxsmm_intrinsics_mm512_rng_state2
[
16
]);
LIBXSMM_APIVAR_PUBLIC
(
unsigned
int
libxsmm_intrinsics_mm512_rng_state3
[
16
]);
/**
* Pseudo intrinsics (AVX-2)
*/
#if defined(LIBXSMM_INTRINSICS_AVX2)
/*__AVX2__*/
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_OFF
/* avoid ICE in case of symbols (-g) */
# endif
/** Generate random number in the interval [0, 1); thread save, state needs to be managed by user.
* this is based on xoshiro128+ 1.0, e.g. http://prng.di.unimi.it/xoshiro128plus.c */
/* One xoshiro128+ step across 8 independent lanes (AVX2). stateptr addresses
 * four state blocks spaced 16 unsigned ints apart; 8 uints are read from each
 * block at offsets 0/16/32/48, updated in place, and the raw 32-bit outputs
 * (state_0 + state_3, computed before the state update) are returned. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) __m256i LIBXSMM_INTRINSICS_MM256_RNG_XOSHIRO128P_EXTSTATE_EPI32(unsigned int* stateptr)
{
  __m256i state_0 = _mm256_loadu_si256((const __m256i*)stateptr);
  __m256i state_1 = _mm256_loadu_si256((const __m256i*)(stateptr + 16));
  __m256i state_2 = _mm256_loadu_si256((const __m256i*)(stateptr + 32));
  __m256i state_3 = _mm256_loadu_si256((const __m256i*)(stateptr + 48));
  /* xoshiro128+ output: s[0] + s[3], taken before advancing the state */
  const __m256i result = _mm256_add_epi32(state_0, state_3);
  const __m256i s = _mm256_slli_epi32(state_1, 9);
  __m256i t;
  state_2 = _mm256_xor_si256(state_2, state_0);
  state_3 = _mm256_xor_si256(state_3, state_1);
  state_1 = _mm256_xor_si256(state_1, state_2);
  state_0 = _mm256_xor_si256(state_0, state_3);
  state_2 = _mm256_xor_si256(state_2, s);
  _mm256_storeu_si256((__m256i*)stateptr, state_0);
  _mm256_storeu_si256((__m256i*)(stateptr + 16), state_1);
  _mm256_storeu_si256((__m256i*)(stateptr + 32), state_2);
  /* s[3] = rotl(s[3], 11), emulated with shift-or (AVX2 has no rotate) */
  t = _mm256_slli_epi32(state_3, 11);
  state_3 = _mm256_or_si256(t, _mm256_srli_epi32(state_3, 32 - 11));
  _mm256_storeu_si256((__m256i*)(stateptr + 48), state_3);
  return result;
}
/** Draw 8 uniform floats in [0, 1): keep 23 random mantissa bits, OR in the
 *  bit pattern of 1.0f to build values in [1, 2), then subtract 1.0f. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) __m256 LIBXSMM_INTRINSICS_MM256_RNG_EXTSTATE_PS(unsigned int* stateptr)
{
  const __m256i draw = LIBXSMM_INTRINSICS_MM256_RNG_XOSHIRO128P_EXTSTATE_EPI32(stateptr);
  const __m256i mantissa = _mm256_srli_epi32(draw, 9); /* 23 random bits */
  const __m256i one_bits = _mm256_set1_epi32(0x3f800000); /* bit pattern of 1.0f */
  const __m256 in_one_two = _mm256_castsi256_ps(_mm256_or_si256(one_bits, mantissa));
  return _mm256_sub_ps(in_one_two, _mm256_set1_ps(1.0f));
}
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_ON
# endif
#endif
/*__AVX2__*/
/**
* Pseudo intrinsics (AVX-512)
*/
#if defined(LIBXSMM_INTRINSICS_AVX512)
/*__AVX512F__*/
# define LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( A, B ) _mm512_cvtepi32_epi16(_mm512_cvt_roundps_epi32( \
_mm512_mul_ps(LIBXSMM_INTRINSICS_MM512_LOAD_PS(A), B), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
/* Round 16 fp32 lanes to bfloat16 precision (round-to-nearest-even into the
 * upper 16 bits); the result keeps the full 32-bit lane layout (the caller
 * shifts/truncates). NaN/Inf lanes pass through unmodified. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(__m512 a)
{
  /* vnaninf: exponent mask (all-ones exponent => NaN or Inf);
   * vrneadd: rounding bias 0x7fff; vfixup(+mask): the extra +1 applied when
   * bit 16 (the future bf16 LSB) is set, implementing ties-to-even */
  const __m512i vnaninf = _mm512_set1_epi32(0x7f800000), vrneadd = _mm512_set1_epi32(0x00007fff);
  const __m512i vfixup = _mm512_set1_epi32(0x00000001), vfixupmask = _mm512_set1_epi32(0x00010000);
  const __m512i mm512_roundbf16rne_a_ = _mm512_castps_si512(a);
  /* lanes that are neither NaN nor Inf get rounded */
  const __mmask16 mm512_roundbf16rne_mask1_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vnaninf), vnaninf, _MM_CMPINT_NE);
  /* lanes whose bf16 LSB is set need the tie-breaking +1 */
  const __mmask16 mm512_roundbf16rne_mask2_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vfixupmask), vfixupmask, _MM_CMPINT_EQ);
  /* add (0x7fff [+1 if odd]) to finite lanes; others keep their bits */
  return _mm512_mask_add_epi32(mm512_roundbf16rne_a_, mm512_roundbf16rne_mask1_, mm512_roundbf16rne_a_,
    _mm512_mask_add_epi32(vrneadd, mm512_roundbf16rne_mask2_, vrneadd, vfixup));
}
/** Convert 16 fp32 values to bf16 (round-to-nearest-even) packed in a __m256i. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m256i LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16(__m512 a)
{
  const __m512i rounded = LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a);
  const __m512i high_halves = _mm512_srai_epi32(rounded, 16);
  return _mm512_cvtepi32_epi16(high_halves);
}
/* Convert two fp32 vectors into one __m512i holding 32 bf16 values.
 * NOTE(review): aa is derived from b and bb from a, so b's bf16 lands in the
 * lower 256 bits and a's in the upper 256 bits — apparently intentional
 * lane ordering; confirm against callers before changing. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_CVT2_FP32_BF16(__m512 a, __m512 b)
{
  const __m256i aa = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(b), 16));
  const __m256i bb = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a), 16));
  /* insert aa (from b) at 256-bit position 0 and bb (from a) at position 1 */
  return _mm512_inserti64x4(_mm512_inserti64x4(_mm512_setzero_si512(), aa, 0), bb, 1);
}
/** Widen 16 packed bf16 values to fp32: place each 16-bit pattern into the
 *  upper half of a 32-bit lane (lower mantissa bits become zero). */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(__m256i a)
{
  const __m512i widened = _mm512_cvtepi16_epi32(a);
  const __m512i fp32_bits = _mm512_slli_epi32(widened, 16);
  return _mm512_castsi512_ps(fp32_bits);
}
/** SVML-intrinsics */
/* tanh(x) via a rational approximation: odd 7th-degree numerator over even
 * 8th-degree-style denominator (both Horner-evaluated in x^2), using the
 * 14-bit-accurate reciprocal instead of a division; lanes with |x| beyond
 * ~4.97 are saturated to +/-1. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(__m512 x)
{
  /* numerator coefficients (c0 is shared with the denominator) */
  const __m512 c0 = _mm512_set1_ps(2027025.0f);
  const __m512 c1 = _mm512_set1_ps(270270.0f);
  const __m512 c2 = _mm512_set1_ps(6930.0f);
  const __m512 c3 = _mm512_set1_ps(36.0f);
  /* denominator-only coefficients */
  const __m512 c1_d = _mm512_set1_ps(945945.0f);
  const __m512 c2_d = _mm512_set1_ps(51975.0f);
  const __m512 c3_d = _mm512_set1_ps(630.0f);
  /* saturation bounds */
  const __m512 hi_bound = _mm512_set1_ps(4.97f);
  const __m512 lo_bound = _mm512_set1_ps(-4.97f);
  const __m512 ones = _mm512_set1_ps(1.0f);
  const __m512 neg_ones = _mm512_set1_ps(-1.0f);
  const __m512 x2 = _mm512_mul_ps(x, x);
  /* Horner in x^2 for the numerator, then times x */
  const __m512 t1_nom = _mm512_fmadd_ps(c3, x2, c2);
  const __m512 t2_nom = _mm512_fmadd_ps(t1_nom, x2, c1);
  const __m512 t3_nom = _mm512_fmadd_ps(t2_nom, x2, c0);
  const __m512 nom = _mm512_mul_ps(t3_nom, x);
  /* Horner in x^2 for the denominator */
  const __m512 t1_denom = _mm512_add_ps(x2, c3_d);
  const __m512 t2_denom = _mm512_fmadd_ps(t1_denom, x2, c2_d);
  const __m512 t3_denom = _mm512_fmadd_ps(t2_denom, x2, c1_d);
  const __m512 denom = _mm512_fmadd_ps(t3_denom, x2, c0);
  /* approximate reciprocal (relative error <= 2^-14) */
  const __m512 denom_rcp = _mm512_rcp14_ps(denom);
  const __mmask16 mask_hi = _mm512_cmp_ps_mask(x, hi_bound, _CMP_GT_OQ);
  const __mmask16 mask_lo = _mm512_cmp_ps_mask(x, lo_bound, _CMP_LT_OQ);
  __m512 result = _mm512_mul_ps(nom, denom_rcp);
  /* saturate out-of-range lanes to +/-1 */
  result = _mm512_mask_blend_ps(mask_hi, result, ones);
  result = _mm512_mask_blend_ps(mask_lo, result, neg_ones);
  return result;
}
/* tanh(x) via a compact rational approximation x*(1 + x^2/27)/(1 + x^2/3),
 * with the division replaced by the 14-bit reciprocal; lanes with |x| beyond
 * 3.2 are saturated to +/-1. Cheaper but less accurate than RATIONAL_78. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_32(__m512 x)
{
  const __m512 c1 = _mm512_set1_ps((float)(1.0 / 27.0));
  const __m512 c2 = _mm512_set1_ps((float)(1.0 / 3));
  const __m512 hi_bound = _mm512_set1_ps(3.2f);
  const __m512 lo_bound = _mm512_set1_ps(-3.2f);
  const __m512 ones = _mm512_set1_ps(1.0f);
  const __m512 neg_ones = _mm512_set1_ps(-1.0f);
  const __m512 x2 = _mm512_mul_ps(x, x);
  const __m512 t1_nom = _mm512_fmadd_ps(x2, c1, ones);
  const __m512 nom = _mm512_mul_ps(t1_nom, x);
  const __m512 denom = _mm512_fmadd_ps(x2, c2, ones);
  const __m512 denom_rcp = _mm512_rcp14_ps(denom);
  const __mmask16 mask_hi = _mm512_cmp_ps_mask(x, hi_bound, _CMP_GT_OQ);
  const __mmask16 mask_lo = _mm512_cmp_ps_mask(x, lo_bound, _CMP_LT_OQ);
  __m512 result = _mm512_mul_ps(nom, denom_rcp);
  /* saturate out-of-range lanes to +/-1 */
  result = _mm512_mask_blend_ps(mask_hi, result, ones);
  result = _mm512_mask_blend_ps(mask_lo, result, neg_ones);
  return result;
}
/* tanh(_x) = 1 - 2/(e^(2*_x) + 1), with the exponential built from a 2nd-order
 * polynomial of the reduced argument and _mm512_scalef_ps for the power-of-two
 * part. The +0.5 bias in x is presumably folded into c0..c2 — TODO confirm
 * against the coefficient derivation. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP2(__m512 _x)
{
  const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695 * 2));
  const __m512 half = _mm512_set1_ps(0.5f);
  /* polynomial coefficients for the 2^y segment */
  const __m512 c2 = _mm512_set1_ps(0.240226507f);
  const __m512 c1 = _mm512_set1_ps(0.452920674f);
  const __m512 c0 = _mm512_set1_ps(0.713483036f);
  const __m512 ones = _mm512_set1_ps(1.0f);
  const __m512 minus_twos = _mm512_set1_ps(-2.0f);
  /* x = 2*_x*log2(e) + 0.5 */
  const __m512 x = _mm512_fmadd_ps(_x, twice_log2_e, half);
#if 1
  /* y = x minus x rounded keeping 1 fraction bit (argument reduction) */
  const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
  const __m512 y = _mm512_reduce_ps(x, 1);
#endif
  const __m512 t1 = _mm512_fmadd_ps(y, c2, c1);
  const __m512 two_to_y = _mm512_fmadd_ps(y, t1, c0);
  /* scalef multiplies by 2^floor(x), completing the exponential */
  const __m512 exp = _mm512_scalef_ps(two_to_y, x);
  const __m512 denom_rcp = _mm512_rcp14_ps(_mm512_add_ps(exp, ones));
  /* 1 - 2/(exp + 1) */
  __m512 result = _mm512_fmadd_ps(denom_rcp, minus_twos, ones);
  return result;
}
/* Same scheme as LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP2 but with a 3rd-order
 * polynomial for the 2^y segment (one more fmadd, higher accuracy):
 * tanh(_x) = 1 - 2/(e^(2*_x) + 1). */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP3(__m512 _x)
{
  const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695 * 2));
  const __m512 half = _mm512_set1_ps(0.5f);
  /* 3rd-order polynomial coefficients for the 2^y segment */
  const __m512 c3 = _mm512_set1_ps(0.05550410866f);
  const __m512 c2 = _mm512_set1_ps(0.15697034396f);
  const __m512 c1 = _mm512_set1_ps(0.49454875509f);
  const __m512 c0 = _mm512_set1_ps(0.70654502287f);
  const __m512 ones = _mm512_set1_ps(1.0f);
  const __m512 minus_twos = _mm512_set1_ps(-2.0f);
  /* x = 2*_x*log2(e) + 0.5 */
  const __m512 x = _mm512_fmadd_ps(_x, twice_log2_e, half);
#if 1
  /* y = x minus x rounded keeping 1 fraction bit (argument reduction) */
  const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
  const __m512 y = _mm512_reduce_ps(x, 1);
#endif
  const __m512 t1 = _mm512_fmadd_ps(y, c3, c2);
  const __m512 t2 = _mm512_fmadd_ps(y, t1, c1);
  const __m512 two_to_y = _mm512_fmadd_ps(y, t2, c0);
  /* scalef multiplies by 2^floor(x), completing the exponential */
  const __m512 exp = _mm512_scalef_ps(two_to_y, x);
  const __m512 denom_rcp = _mm512_rcp14_ps(_mm512_add_ps(exp, ones));
  /* 1 - 2/(exp + 1) */
  __m512 result = _mm512_fmadd_ps(denom_rcp, minus_twos, ones);
  return result;
}
/* tanh(x) via a table-driven 2nd-degree minimax polynomial: bits [31:22] of
 * |x| (sign + exponent + top mantissa bit) select one of 16 coefficient
 * triples (p0, p1, p2) held in permute tables; the index is clamped to
 * [246, 261] and permutexvar uses it modulo the table size. The result's sign
 * is restored from x at the end. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(__m512 x)
{
  __m512 result, func_p0, func_p1, func_p2;
  const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
  const __m512i sign_filter = _mm512_set1_epi32(0x7FFFFFFF);
  /* clamp bounds for the exponent-derived table index */
  const __m512i lut_low = _mm512_set1_epi32(246);
  const __m512i lut_high = _mm512_set1_epi32(261);
  /* per-segment constant, linear and quadratic coefficients */
  const __m512 tanh_p0_2_reg = _mm512_set_ps(0.40555000f, 0.11892800f, -0.00972979f, -0.02740300f,
    -0.0169851f, -0.00776152f, -0.00305889f, -0.00116259f, -0.00041726f, -8.53233e-6f, 1.0000000f,
    0.99999800f, 0.99975400f, 0.99268200f, 0.93645300f, 0.73833900f);
  const __m512 tanh_p1_2_reg = _mm512_set_ps(0.495602f, 0.88152f, 1.125700000f, 1.17021000f,
    1.1289000000f, 1.07929000f, 1.0432300f, 1.023010f, 1.011620f, 1.00164f, 1.56828e-14f,
    4.49924e-7f, 0.0000646924f, 0.00260405f, 0.0311608f, 0.168736f);
  const __m512 tanh_p2_2_reg = _mm512_set_ps(-0.108238f, -0.2384280f, -0.354418000f, -0.38240300f,
    -0.34135700f, -0.274509000f, -0.20524900f, -0.1511960f, -0.107635f, -0.0466868f, -3.60822e-16f,
    -2.05971e-8f, -4.24538e-6f, -0.000231709f, -0.00386434f, -0.0277702f);
  const __m512i signs = _mm512_and_epi32(_mm512_castps_si512(x), sign_mask);
  const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter);
  /* segment index from the top 10 non-sign bits of |x| */
  __m512i indices = _mm512_srli_epi32(abs_arg, 22);
  indices = _mm512_max_epi32(indices, lut_low);
  indices = _mm512_min_epi32(indices, lut_high);
  /* per-lane coefficient gather */
  func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_2_reg);
  func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_2_reg);
  func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_2_reg);
  /* Horner on |x|: (p2*|x| + p1)*|x| + p0, then re-apply the sign of x */
  result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p2, func_p1);
  result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0);
  result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs));
  return result;
}
/* tanh(x) via a table-driven 3rd-degree minimax polynomial — same segment
 * selection as MINIMAX2 (bits [31:22] of |x|, clamped to [246, 261]) but with
 * four coefficient tables (p0..p3), i.e. one more Horner step for accuracy.
 * Note: these tables use _mm512_setr_ps (ascending order), unlike MINIMAX2. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(__m512 x)
{
  __m512 result, func_p0, func_p1, func_p2, func_p3;
  const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
  const __m512i sign_filter = _mm512_set1_epi32(0x7FFFFFFF);
  /* clamp bounds for the exponent-derived table index */
  const __m512i lut_low = _mm512_set1_epi32(246);
  const __m512i lut_high = _mm512_set1_epi32(261);
  const __m512 tanh_p0_3_reg = _mm512_setr_ps(0.466283000f, 0.82850600f, 0.97437500f, 0.99882600f,
    0.9999860f, 1.0000000f, -1.50006e-08f, -7.98169e-06f, -4.53753e-05f, -0.00023755f, -0.00125285f,
    -0.00572314f, -0.0227717f, -0.0629089f, -0.084234300f, 0.071199800f);
  const __m512 tanh_p1_3_reg = _mm512_setr_ps(0.500617f, 0.124369f, 0.0137214f, 0.000464124f,
    4.02465e-06f, 0.00000f, 1.00001f, 1.00028f, 1.00112f, 1.00414f, 1.015570f, 1.050950f,
    1.1478500f, 1.310130000f, 1.378950000f, 1.07407f);
  const __m512 tanh_p2_3_reg = _mm512_setr_ps(-0.16133200f, -0.0305526f, -0.00245909f, -6.12647e-05f,
    -3.76127e-07f, 0.000000f, -0.000245872f, -0.00341151f, -0.00971505f, -0.0256817f, -0.06869110f,
    -0.162433000f, -0.346828000f, -0.566516f, -0.640214000f, -0.44011900f);
  const __m512 tanh_p3_3_reg = _mm512_setr_ps(0.0177393f, 0.00253432f, 0.000147303f, 2.69963e-06f,
    1.16764e-08f, 0.0000000f, -0.330125f, -0.3176210f, -0.3017760f, -0.27358000f, -0.219375000f,
    -0.136197000f, -0.01868680f, 0.0808901f, 0.107095f, 0.0631459f);
  const __m512i signs = _mm512_and_epi32(_mm512_castps_si512(x), sign_mask);
  const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter);
  /* segment index from the top 10 non-sign bits of |x| */
  __m512i indices = _mm512_srli_epi32(abs_arg, 22);
  indices = _mm512_max_epi32(indices, lut_low);
  indices = _mm512_min_epi32(indices, lut_high);
  /* per-lane coefficient gather */
  func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_3_reg);
  func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_3_reg);
  func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_3_reg);
  func_p3 = _mm512_permutexvar_ps(indices, tanh_p3_3_reg);
  /* Horner on |x|: ((p3*|x| + p2)*|x| + p1)*|x| + p0, then re-apply the sign */
  result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p3, func_p2);
  result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p1);
  result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0);
  result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs));
  return result;
}
#if defined(LIBXSMM_INTRINSICS_AVX512_CORE)
/*__AVX512DQ__ needed*/
/* GELU forward via a per-lane quadratic minimax polynomial with table-driven
 * coefficients: result = x * (poly(|xr|) * xr + 0.5). Requires AVX512DQ for
 * _mm512_range_round_ps (hence the AVX512_CORE gate). All constants are given
 * as raw fp32 bit patterns. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) __m512 LIBXSMM_INTRINSICS_MM512_GELU_FWD_PS_MINIMAX3(__m512 x)
{
  /* magnitude clamp threshold (0x40879fff, roughly 4.24) */
  const __m512 thres = _mm512_castsi512_ps(_mm512_set1_epi32(0x40879fff));
  const __m512 absmask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff));
  /* float-to-index trick: xa*scale + 0x4b400000 leaves the table index in the
   * low mantissa bits, which permutexvar then consumes directly */
  const __m512 scale = _mm512_castsi512_ps(_mm512_set1_epi32(0x406a0ea1));
  const __m512 shifter = _mm512_castsi512_ps(_mm512_set1_epi32(0x4b400000));
  const __m512 half = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f000000)); /* 0.5f */
  /* quadratic (_c2), linear (_c1) and constant (_c0) coefficient tables */
  const __m512 _c2 = _mm512_castsi512_ps(_mm512_setr_epi32(0xbd877b85u, 0xbd7d9780u, 0xbd4cb70eu, 0xbd08a1e9u,
    0xbc808857u, 0xb9476fd2u, 0x3c36f765u, 0x3c924160u, 0x3ca7b1fcu, 0x3ca5732cu, 0x3c95af63u, 0x3c8079f7u,
    0x3c55fa4fu, 0x3c2fa86bu, 0x3c0fbb00u, 0x3bec178cu));
  const __m512 _c1 = _mm512_castsi512_ps(_mm512_setr_epi32(0xb7c7fb58u, 0xbacb9740u, 0xbc3e4b3au, 0xbd0d292au,
    0xbd8bc5d0u, 0xbdd9978fu, 0xbe0f92d3u, 0xbe27b66du, 0xbe328ce7u, 0xbe3125bfu, 0xbe26dc9du, 0xbe17a056u,
    0xbe06bdebu, 0xbdecc593u, 0xbdcf57aau, 0xbdb5ea3au));
  const __m512 _c0 = _mm512_castsi512_ps(_mm512_setr_epi32(0x3ecc4231u, 0x3ecc541cu, 0x3ecd6c48u, 0x3ed174c3u,
    0x3ed9bd5du, 0x3ee5acd5u, 0x3ef2aeddu, 0x3efd5384u, 0x3f016724u, 0x3f00f778u, 0x3efb389eu, 0x3ef0464du,
    0x3ee3014fu, 0x3ed50a78u, 0x3ec779dbu, 0x3ebae363u));
  __m512 result;
  /* clamp x to [-thres, +thres]: range imm 2 = absolute min, sign from x */
  __m512 xr = _mm512_range_round_ps(x, thres, 2, _MM_FROUND_NO_EXC);
  __m512 xa = _mm512_and_ps(xr, absmask);
  __m512 index = _mm512_fmadd_ps(xa, scale, shifter);
  __m512 c2 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c2);
  __m512 c1 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c1);
  __m512 c0 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c0);
  /* Horner: poly = (c2*xa + c1)*xa + c0 */
  __m512 poly = _mm512_fmadd_ps(c2, xa, c1);
  poly = _mm512_fmadd_ps(poly, xa, c0);
  result = _mm512_mul_ps(x, _mm512_fmadd_ps(poly, xr, half));
  return result;
}
#endif
/*defined(LIBXSMM_INTRINSICS_AVX512_CORE)*/
#if defined(LIBXSMM_INTRINSICS_AVX512_CORE)
/*__AVX512DQ__ needed*/
/* GELU backward (derivative) via the same table-driven quadratic minimax
 * scheme as the forward variant: result = poly(|xr|) * xr + 0.5 (no final
 * multiply by x). Requires AVX512DQ for _mm512_range_round_ps. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) __m512 LIBXSMM_INTRINSICS_MM512_GELU_BWD_PS_MINIMAX3(__m512 x)
{
  /* magnitude clamp threshold (0x408f5fff, roughly 4.48) */
  const __m512 thres = _mm512_castsi512_ps(_mm512_set1_epi32(0x408f5fff));
  const __m512 absmask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff));
  /* float-to-index trick: xa*scale + 0x4b400000 leaves the table index in the
   * low mantissa bits, which permutexvar then consumes directly */
  const __m512 scale = _mm512_castsi512_ps(_mm512_set1_epi32(0x405d67c9));
  const __m512 shifter = _mm512_castsi512_ps(_mm512_set1_epi32(0x4b400000));
  const __m512 half = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f000000)); /* 0.5f */
  /* quadratic (_c2), linear (_c1) and constant (_c0) coefficient tables */
  const __m512 _c2 = _mm512_castsi512_ps(_mm512_setr_epi32(0xbe87047bu, 0xbe6eb875u, 0xbe2210c1u, 0xbd81727fu,
    0x3cb9625cu, 0x3da2cbe8u, 0x3dd1d4d1u, 0x3dca0bd0u, 0x3da47dd0u, 0x3d6f1bd3u, 0x3d216381u, 0x3cd2618cu,
    0x3c89f6e6u, 0x3c3ca672u, 0x3c08ed08u, 0x3bd26a14u));
  const __m512 _c1 = _mm512_castsi512_ps(_mm512_setr_epi32(0xb930e738u, 0xbc4b28bau, 0xbda4212fu, 0xbe5feb0eu,
    0xbec8b0e5u, 0xbf09e61bu, 0xbf1c403fu, 0xbf185954u, 0xbf03e1eeu, 0xbed08a61u, 0xbe9b4508u, 0xbe61788bu,
    0xbe257770u, 0xbdfc542au, 0xbdca014eu, 0xbda8d7e9u));
  const __m512 _c0 = _mm512_castsi512_ps(_mm512_setr_epi32(0x3f4c4245u, 0x3f4c927bu, 0x3f5085f8u, 0x3f5d7bdau,
    0x3f73ea12u, 0x3f86142fu, 0x3f8d3df4u, 0x3f8b4b0fu, 0x3f8022c8u, 0x3f5e5423u, 0x3f39ceb5u, 0x3f199bedu,
    0x3f00bee0u, 0x3ede1737u, 0x3ec59b86u, 0x3eb4454cu));
  __m512 result;
  /* clamp x to [-thres, +thres]: range imm 2 = absolute min, sign from x */
  __m512 xr = _mm512_range_round_ps(x, thres, 2, _MM_FROUND_NO_EXC);
  __m512 xa = _mm512_and_ps(xr, absmask);
  __m512 index = _mm512_fmadd_ps(xa, scale, shifter);
  __m512 c2 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c2);
  __m512 c1 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c1);
  __m512 c0 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c0);
  /* Horner: poly = (c2*xa + c1)*xa + c0 */
  __m512 poly = _mm512_fmadd_ps(c2, xa, c1);
  poly = _mm512_fmadd_ps(poly, xa, c0);
  result = _mm512_fmadd_ps(poly, xr, half);
  return result;
}
#endif
/*defined(LIBXSMM_INTRINSICS_AVX512_CORE)*/
/** GELU forward pass via the tanh approximation:
 *  0.5*x*(1 + tanh(x*(0.79788 + 0.03568*x^2))). The tanh itself is
 *  evaluated by LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_GELU_FWD(__m512 x)
{
  const __m512 coef1  = _mm512_set1_ps((float)0.79788);
  const __m512 coef2  = _mm512_set1_ps((float)0.03568);
  const __m512 c_half = _mm512_set1_ps((float)0.5);
  const __m512 half_x = _mm512_mul_ps(x, c_half);
  const __m512 x2     = _mm512_mul_ps(x, x);
  /* argument of tanh: x * (coef2*x^2 + coef1) */
  const __m512 arg    = _mm512_mul_ps(x, _mm512_fmadd_ps(x2, coef2, coef1));
  const __m512 th     = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(arg);
  /* 0.5*x*tanh(...) + 0.5*x */
  return _mm512_fmadd_ps(th, half_x, half_x);
}
/** GELU backward pass via the tanh approximation; statement order mirrors
 *  the forward formulation so results are bitwise-stable with it. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_GELU_BWD(__m512 x)
{
  const __m512 coef1     = _mm512_set1_ps((float)0.79788);
  const __m512 coef2     = _mm512_set1_ps((float)0.03568);
  const __m512 coef3     = _mm512_set1_ps((float)0.05352);
  const __m512 coef4     = _mm512_set1_ps((float)0.39894);
  const __m512 c_half    = _mm512_set1_ps((float)0.5);
  const __m512 c_ones    = _mm512_set1_ps((float)1.0);
  const __m512 c_minus_1 = _mm512_set1_ps((float)-1.0);
  const __m512 x2 = _mm512_mul_ps(x, x);
  /* tanh argument: x*(coef2*x^2 + coef1); auxiliary term: x*(coef3*x^2 + coef4) */
  const __m512 arg = _mm512_mul_ps(x, _mm512_fmadd_ps(x2, coef2, coef1));
  const __m512 aux = _mm512_mul_ps(x, _mm512_fmadd_ps(x2, coef3, coef4));
  const __m512 th  = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(arg);
  const __m512 t1  = _mm512_add_ps(c_ones, th);           /* 1 + tanh */
  const __m512 t2  = _mm512_add_ps(c_half, aux);          /* 0.5 + aux */
  const __m512 t3  = _mm512_fmsub_ps(aux, th, t2);        /* aux*tanh - (0.5 + aux) */
  const __m512 t4  = _mm512_mul_ps(c_minus_1, t3);        /* negate */
  return _mm512_mul_ps(t1, t4);
}
/** Fast single-precision exp() using a degree-2 polynomial for 2^frac and
 *  VSCALEFPS for the integer part of the exponent. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512 LIBXSMM_INTRINSICS_MM512_EXP_PS_2DTS(__m512 in)
{
  const __m512 log2_e = _mm512_set1_ps(1.442695f); /* 1/ln(2) */
  const __m512 half   = _mm512_set1_ps(0.5f);
  const __m512 p2     = _mm512_set1_ps(0.240226507f);
  const __m512 p1     = _mm512_set1_ps(0.452920674f);
  const __m512 p0     = _mm512_set1_ps(0.713483036f);
  /* x = in/ln(2) + 0.5; the +0.5 biases the round-to-int below */
  const __m512 x = _mm512_fmadd_ps(in, log2_e, half);
#if 1
  /* fractional part of x (rounded per current direction) */
  const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
  const __m512 y = _mm512_reduce_ps(x, 1);
#endif
  /* Horner: 2^y ~ p2*y^2 + p1*y + p0 */
  const __m512 t1       = _mm512_fmadd_ps(y, p2, p1);
  const __m512 two_to_y = _mm512_fmadd_ps(y, t1, p0);
  /* scalef applies the integer exponent: two_to_y * 2^floor(x) */
  return _mm512_scalef_ps(two_to_y, x);
}
/** Fast single-precision exp() using a degree-3 polynomial for 2^frac
 *  (one more term than the 2DTS variant, i.e. more accurate) and
 *  VSCALEFPS for the integer part of the exponent. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512 LIBXSMM_INTRINSICS_MM512_EXP_PS_3DTS(__m512 in)
{
  const __m512 log2_e = _mm512_set1_ps(1.442695f); /* 1/ln(2) */
  const __m512 half   = _mm512_set1_ps(0.5f);
  const __m512 p3     = _mm512_set1_ps(0.05550410866f);
  const __m512 p2     = _mm512_set1_ps(0.15697034396f);
  const __m512 p1     = _mm512_set1_ps(0.49454875509f);
  const __m512 p0     = _mm512_set1_ps(0.70654502287f);
  const __m512 x = _mm512_fmadd_ps(in, log2_e, half);
#if 1
  /* fractional part of x (rounded per current direction) */
  const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION));
#else
  const __m512 y = _mm512_reduce_ps(x, 1);
#endif
  /* Horner: 2^y ~ ((p3*y + p2)*y + p1)*y + p0 */
  const __m512 t1       = _mm512_fmadd_ps(y, p3, p2);
  const __m512 t2       = _mm512_fmadd_ps(y, t1, p1);
  const __m512 two_to_y = _mm512_fmadd_ps(y, t2, p0);
  return _mm512_scalef_ps(two_to_y, x);
}
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_OFF
/* avoid ICE in case of symbols (-g) */
# endif
/** Generate random number in the interval [0, 1); not thread-safe.
* this is based on xoshiro128+ 1.0, e.g. http://prng.di.unimi.it/xoshiro128plus.c */
/** One step of the vectorized xoshiro128+ generator operating on the
 *  library-global SIMD state (LIBXSMM_INTRINSICS_MM512_RNG_STATE);
 *  yields 16 pseudo-random 32-bit integers. Not thread-safe. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(void)
{
  /* output is taken before the state transition (the "+" of xoshiro128+) */
  const __m512i result = _mm512_add_epi32(
    LIBXSMM_INTRINSICS_MM512_RNG_STATE(0),
    LIBXSMM_INTRINSICS_MM512_RNG_STATE(3));
  const __m512i shl9 = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), 9);
  __m512i rot;
  /* state transition; the xor order below is significant */
  LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(
    LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), LIBXSMM_INTRINSICS_MM512_RNG_STATE(0));
  LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_xor_epi32(
    LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), LIBXSMM_INTRINSICS_MM512_RNG_STATE(1));
  LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = _mm512_xor_epi32(
    LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), LIBXSMM_INTRINSICS_MM512_RNG_STATE(2));
  LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_xor_epi32(
    LIBXSMM_INTRINSICS_MM512_RNG_STATE(0), LIBXSMM_INTRINSICS_MM512_RNG_STATE(3));
  LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(
    LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), shl9);
  /* rotate state[3] left by 11 bits */
  rot = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 11);
  LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_or_epi32(rot,
    _mm512_srli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 32 - 11));
  return result;
}
/** 16 pseudo-random floats in [0, 1): random bits are placed into the
 *  mantissa of a float in [1, 2), then 1.0f is subtracted. Not thread-safe. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512 LIBXSMM_INTRINSICS_MM512_RNG_PS(void)
{
  /* keep the 23 most significant random bits as mantissa */
  const __m512i mant = _mm512_srli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(), 9);
  const __m512 one = _mm512_set1_ps(1.0f);
  const __m512 val = _mm512_castsi512_ps(
    _mm512_or_epi32(_mm512_set1_epi32(0x3f800000), mant)); /* in [1,2) */
  return _mm512_sub_ps(val, one);
}
/** Generate random number in the interval [0, 1); thread-safe, but the state needs to be managed by the caller.
 * This is based on xoshiro128+ 1.0, e.g. http://prng.di.unimi.it/xoshiro128plus.c */
/** One step of vectorized xoshiro128+ on caller-provided state
 *  (4 x 16 unsigned ints at stateptr, stateptr+16, +32, +48);
 *  returns 16 pseudo-random 32-bit integers and updates the state in place. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32(unsigned int* stateptr)
{
  __m512i s0 = _mm512_loadu_si512(stateptr);
  __m512i s1 = _mm512_loadu_si512(stateptr + 16);
  __m512i s2 = _mm512_loadu_si512(stateptr + 32);
  __m512i s3 = _mm512_loadu_si512(stateptr + 48);
  /* output is taken before the state transition (the "+" of xoshiro128+) */
  const __m512i result = _mm512_add_epi32(s0, s3);
  const __m512i shl9 = _mm512_slli_epi32(s1, 9);
  __m512i rot;
  /* state transition; xor order is significant */
  s2 = _mm512_xor_epi32(s2, s0);
  s3 = _mm512_xor_epi32(s3, s1);
  s1 = _mm512_xor_epi32(s1, s2);
  s0 = _mm512_xor_epi32(s0, s3);
  s2 = _mm512_xor_epi32(s2, shl9);
  _mm512_storeu_si512(stateptr, s0);
  _mm512_storeu_si512(stateptr + 16, s1);
  _mm512_storeu_si512(stateptr + 32, s2);
  /* rotate s3 left by 11 bits, then store it */
  rot = _mm512_slli_epi32(s3, 11);
  s3 = _mm512_or_epi32(rot, _mm512_srli_epi32(s3, 32 - 11));
  _mm512_storeu_si512(stateptr + 48, s3);
  return result;
}
/** 16 pseudo-random floats in [0, 1) from caller-provided xoshiro128+ state:
 *  random bits become the mantissa of a float in [1, 2), then 1.0f is subtracted. */
LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
__m512 LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS(unsigned int* stateptr)
{
  /* keep the 23 most significant random bits as mantissa */
  const __m512i mant = _mm512_srli_epi32(
    LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32(stateptr), 9);
  const __m512 one = _mm512_set1_ps(1.0f);
  const __m512 val = _mm512_castsi512_ps(
    _mm512_or_epi32(_mm512_set1_epi32(0x3f800000), mant)); /* in [1,2) */
  return _mm512_sub_ps(val, one);
}
# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0
LIBXSMM_PRAGMA_OPTIMIZE_ON
# endif
#endif
/*__AVX512F__*/
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
#endif
/*LIBXSMM_INTRINSICS_X86_H*/
/* ==== End of libxsmm_intrinsics_x86.h; next file in this diff:
   third_party/libxsmm/include/libxsmm_macros.h (new file, mode 100644, commit c454d419) ==== */
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MACROS_H
#define LIBXSMM_MACROS_H
#include "libxsmm_config.h"
/** Parameters the library was built for. */
#define LIBXSMM_CACHELINE LIBXSMM_CONFIG_CACHELINE
#define LIBXSMM_ALIGNMENT LIBXSMM_CONFIG_ALIGNMENT
#define LIBXSMM_MALLOC LIBXSMM_CONFIG_MALLOC
#define LIBXSMM_ILP64 LIBXSMM_CONFIG_ILP64
#define LIBXSMM_SYNC LIBXSMM_CONFIG_SYNC
#define LIBXSMM_JIT LIBXSMM_CONFIG_JIT
/** Parameters of GEMM domain (static kernels, etc). */
#define LIBXSMM_PREFETCH LIBXSMM_CONFIG_PREFETCH
#define LIBXSMM_MAX_MNK LIBXSMM_CONFIG_MAX_MNK
#define LIBXSMM_MAX_DIM LIBXSMM_CONFIG_MAX_DIM
#define LIBXSMM_MAX_M LIBXSMM_CONFIG_MAX_M
#define LIBXSMM_MAX_N LIBXSMM_CONFIG_MAX_N
#define LIBXSMM_MAX_K LIBXSMM_CONFIG_MAX_K
#define LIBXSMM_FLAGS LIBXSMM_CONFIG_FLAGS
#define LIBXSMM_ALPHA LIBXSMM_CONFIG_ALPHA
#define LIBXSMM_BETA LIBXSMM_CONFIG_BETA
/**
* Use "make PLATFORM=1" to disable platform checks.
* The platform check is to bail-out with an error
* message for an attempt to build an upstream package
* and subsequently to list LIBXSMM as "broken" on
* that platform.
* Note: successful compilation on an unsupported
* platform is desired, but only fallback code is
* present at best.
*/
#if !defined(LIBXSMM_PLATFORM_FORCE) && 0
# define LIBXSMM_PLATFORM_FORCE
#endif
/* Compile-time detection of the target CPU family. */
#if !defined(LIBXSMM_PLATFORM_X86) && ( \
(defined(__x86_64__) && 0 != (__x86_64__)) || \
(defined(__amd64__) && 0 != (__amd64__)) || \
(defined(_M_X64) || defined(_M_AMD64)) || \
(defined(__i386__) && 0 != (__i386__)) || \
(defined(_M_IX86)))
# define LIBXSMM_PLATFORM_X86
#endif
#if !defined(LIBXSMM_PLATFORM_AARCH64) && \
(defined(__aarch64__) || defined(__arm64__))
# define LIBXSMM_PLATFORM_AARCH64
#endif
#if !defined(LIBXSMM_PLATFORM_SUPPORTED)
# if defined(LIBXSMM_PLATFORM_X86) || defined(LIBXSMM_PLATFORM_AARCH64)
# define LIBXSMM_PLATFORM_SUPPORTED
# elif !defined(LIBXSMM_PLATFORM_FORCE)
# error LIBXSMM requires X86_64, AArch64, or compatible CPUs!
# endif
#endif
/* Determine pointer width (32/64-bit) and the largest representable count. */
#if !defined(LIBXSMM_BITS)
# if (defined(__SIZEOF_PTRDIFF_T__) && 4 < (__SIZEOF_PTRDIFF_T__)) || \
(defined(__SIZE_MAX__) && (4294967295U < (__SIZE_MAX__))) || \
(defined(__x86_64__) && 0 != (__x86_64__)) || \
(defined(__amd64__) && 0 != (__amd64__)) || \
(defined(_M_X64) || defined(_M_AMD64)) || \
(defined(_WIN64)) || \
(defined(__powerpc64)) || \
(defined(__aarch64__))
# define LIBXSMM_UNLIMITED 0xFFFFFFFFFFFFFFFF
# define LIBXSMM_BITS 64
# elif !defined(LIBXSMM_PLATFORM_FORCE) && defined(NDEBUG)
# error LIBXSMM is only supported on 64-bit platforms!
# else
/* JIT-generated code (among other issues) is not supported! */
# define LIBXSMM_UNLIMITED 0xFFFFFFFF
# define LIBXSMM_BITS 32
# endif
#endif
/* Token pasting/stringification helpers; LIBXSMM_CONCATENATE expands its
 * arguments before pasting (hence the 2-level indirection). */
#define LIBXSMM_STRINGIFY2(SYMBOL) #SYMBOL
#define LIBXSMM_STRINGIFY(SYMBOL) LIBXSMM_STRINGIFY2(SYMBOL)
#define LIBXSMM_TOSTRING(SYMBOL) LIBXSMM_STRINGIFY(SYMBOL)
#define LIBXSMM_CONCATENATE2(A, B) A##B
#define LIBXSMM_CONCATENATE3(A, B, C) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE(A, B), C)
#define LIBXSMM_CONCATENATE4(A, B, C, D) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE3(A, B, C), D)
#define LIBXSMM_CONCATENATE(A, B) LIBXSMM_CONCATENATE2(A, B)
#define LIBXSMM_FSYMBOL(SYMBOL) LIBXSMM_CONCATENATE(SYMBOL, _)
#define LIBXSMM_UNIQUE(NAME) LIBXSMM_CONCATENATE(NAME, __LINE__)
#define LIBXSMM_EXPAND(...) __VA_ARGS__
#define LIBXSMM_ELIDE(...)
/**
 * Check given value against type-range (assertion).
 * Note: allows "-1" for unsigned types.
 */
#if !defined(NDEBUG)
# define LIBXSMM_CHECK_ULLONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULLONG_MAX)
# define LIBXSMM_CHECK_LLONG(VALUE) assert(LLONG_MIN <= (VALUE) && (VALUE) <= LLONG_MAX)
# define LIBXSMM_CHECK_ULONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULONG_MAX)
# define LIBXSMM_CHECK_LONG(VALUE) assert(LONG_MIN <= (VALUE) && (VALUE) <= LONG_MAX)
# define LIBXSMM_CHECK_USHORT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= USHRT_MAX)
# define LIBXSMM_CHECK_SHORT(VALUE) assert(SHRT_MIN <= (VALUE) && (VALUE) <= SHRT_MAX)
# define LIBXSMM_CHECK_UCHAR(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UCHAR_MAX)
# define LIBXSMM_CHECK_ICHAR(VALUE) assert(SCHAR_MIN <= (VALUE) && (VALUE) <= SCHAR_MAX)
# define LIBXSMM_CHECK_UINT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UINT_MAX)
# define LIBXSMM_CHECK_INT(VALUE) assert(INT_MIN <= (VALUE) && (VALUE) <= INT_MAX)
#else
# define LIBXSMM_CHECK_ULLONG(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_LLONG(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_ULONG(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_LONG(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_USHORT(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_SHORT(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_UCHAR(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_ICHAR(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_UINT(VALUE) 0 /*dummy*/
# define LIBXSMM_CHECK_INT(VALUE) 0 /*dummy*/
#endif
/**
 * Perform verbose type-cast with following two advantages:
 * (1) Make it easy to locate/find the type-cast.
 * (2) Range-check to ensure fitting into type.
 */
#define LIBXSMM_CAST_ULLONG(VALUE) (LIBXSMM_CHECK_ULLONG(VALUE), (unsigned long long)(VALUE))
#define LIBXSMM_CAST_LLONG(VALUE) (LIBXSMM_CHECK_LLONG(VALUE), (/*signed*/long long)(VALUE))
#define LIBXSMM_CAST_ULONG(VALUE) (LIBXSMM_CHECK_ULONG(VALUE), (unsigned long)(VALUE))
#define LIBXSMM_CAST_LONG(VALUE) (LIBXSMM_CHECK_LONG(VALUE), (/*signed*/long)(VALUE))
#define LIBXSMM_CAST_USHORT(VALUE) (LIBXSMM_CHECK_USHORT(VALUE), (unsigned short)(VALUE))
#define LIBXSMM_CAST_SHORT(VALUE) (LIBXSMM_CHECK_SHORT(VALUE), (/*signed*/short)(VALUE))
#define LIBXSMM_CAST_UCHAR(VALUE) (LIBXSMM_CHECK_UCHAR(VALUE), (unsigned char)(VALUE))
#define LIBXSMM_CAST_ICHAR(VALUE) (LIBXSMM_CHECK_ICHAR(VALUE), (signed char)(VALUE))
#define LIBXSMM_CAST_UINT(VALUE) (LIBXSMM_CHECK_UINT(VALUE), (unsigned int)(VALUE))
#define LIBXSMM_CAST_INT(VALUE) (LIBXSMM_CHECK_INT(VALUE), (/*signed*/int)(VALUE))
/** Use LIBXSMM_VERSION2 instead of LIBXSMM_VERSION3, e.g., if __GNUC_PATCHLEVEL__ or __clang_patchlevel__ is zero (0). */
#define LIBXSMM_VERSION2(MAJOR, MINOR) ((MAJOR) * 10000 + (MINOR) * 100)
#define LIBXSMM_VERSION3(MAJOR, MINOR, UPDATE) (LIBXSMM_VERSION2(MAJOR, MINOR) + (UPDATE))
/* Packed version number: 7-bit major, 5-bit minor, 5-bit update, 14-bit patch. */
#define LIBXSMM_VERSION4(MAJOR, MINOR, UPDATE, PATCH) \
  (((0x7F & (MAJOR)) << 24) | ((0x1F & (MINOR)) << 19) | ((0x1F & (UPDATE)) << 14) | (0x3FFF & (PATCH)))
#define LIBXSMM_VERSION41(VERSION) (((VERSION) >> 24))
#define LIBXSMM_VERSION42(VERSION) (((VERSION) >> 19) & 0x1F)
#define LIBXSMM_VERSION43(VERSION) (((VERSION) >> 14) & 0x1F)
#define LIBXSMM_VERSION44(VERSION) (((VERSION)) & 0x3FFF)
/* LIBXSMM_UNPACKED disables structure packing on toolchains where it is broken. */
#if !defined(LIBXSMM_UNPACKED) && (defined(_CRAYC) || defined(LIBXSMM_OFFLOAD_BUILD) || \
  (0 == LIBXSMM_SYNC)/*Windows: missing pack(pop) error*/)
# define LIBXSMM_UNPACKED
#endif
/* Compiler-specific attribute/alignment/packing/calling-convention syntax. */
#if defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__)
# define LIBXSMM_ATTRIBUTE(A) __declspec(A)
# if defined(__cplusplus)
# define LIBXSMM_INLINE_ALWAYS __forceinline
# else
# define LIBXSMM_INLINE_ALWAYS static __forceinline
# endif
# define LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(align(N)) DECL
# if !defined(LIBXSMM_UNPACKED)
# define LIBXSMM_PACKED(TYPE) LIBXSMM_PRAGMA(pack(1)) TYPE
# endif
# define LIBXSMM_CDECL __cdecl
#elif (defined(__GNUC__) || defined(__clang__) || defined(__PGI))
# define LIBXSMM_ATTRIBUTE(A) __attribute__((A))
# define LIBXSMM_INLINE_ALWAYS LIBXSMM_ATTRIBUTE(always_inline) LIBXSMM_INLINE
# define LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(aligned(N)) DECL
# if !defined(LIBXSMM_UNPACKED)
# define LIBXSMM_PACKED(TYPE) TYPE LIBXSMM_ATTRIBUTE(__packed__)
# endif
# define LIBXSMM_CDECL LIBXSMM_ATTRIBUTE(cdecl)
#else
# define LIBXSMM_ATTRIBUTE(A)
# define LIBXSMM_INLINE_ALWAYS LIBXSMM_INLINE
# define LIBXSMM_ALIGNED(DECL, N) DECL
# define LIBXSMM_CDECL
#endif
#if !defined(LIBXSMM_PACKED)
# define LIBXSMM_PACKED(TYPE) TYPE
# if !defined(LIBXSMM_UNPACKED)
# define LIBXSMM_UNPACKED
# endif
#endif
#if !defined(LIBXSMM_UNPACKED) && 0
/* no braces around EXPR */
# define LIBXSMM_PAD(EXPR) EXPR;
#endif
#if !defined(LIBXSMM_PAD)
# define LIBXSMM_PAD(EXPR)
#endif
/* Normalized Intel compiler version (includes the update number when available). */
#if defined(__INTEL_COMPILER)
# if !defined(__INTEL_COMPILER_UPDATE)
# define LIBXSMM_INTEL_COMPILER __INTEL_COMPILER
# else
# define LIBXSMM_INTEL_COMPILER (__INTEL_COMPILER + __INTEL_COMPILER_UPDATE)
# endif
#elif defined(__INTEL_COMPILER_BUILD_DATE)
# define LIBXSMM_INTEL_COMPILER ((__INTEL_COMPILER_BUILD_DATE / 10000 - 2000) * 100)
#endif
/* LIBXSMM_ATTRIBUTE_USED: mark library functions as used to avoid warning */
#if defined(__GNUC__) || defined(__clang__) || (defined(__INTEL_COMPILER) && !defined(_WIN32))
# if !defined(__cplusplus) || !defined(__clang__)
# define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(common)
# else
# define LIBXSMM_ATTRIBUTE_COMMON
# endif
# define LIBXSMM_ATTRIBUTE_MALLOC LIBXSMM_ATTRIBUTE(malloc)
# define LIBXSMM_ATTRIBUTE_UNUSED LIBXSMM_ATTRIBUTE(unused)
# define LIBXSMM_ATTRIBUTE_USED LIBXSMM_ATTRIBUTE(used)
#else
# if defined(_WIN32)
# define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(selectany)
# else
# define LIBXSMM_ATTRIBUTE_COMMON
# endif
# define LIBXSMM_ATTRIBUTE_MALLOC
# define LIBXSMM_ATTRIBUTE_UNUSED
# define LIBXSMM_ATTRIBUTE_USED
#endif
/* Suppress a specific sanitizer (Clang takes a string, GCC >= 4.8 an identifier). */
#if defined(__clang__) && !defined(__INTEL_COMPILER)
# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(no_sanitize(LIBXSMM_STRINGIFY(KIND)))
#elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 8) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) \
&& !defined(__INTEL_COMPILER)
# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(LIBXSMM_CONCATENATE(no_sanitize_, KIND))
#else
# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND)
#endif
/* Language-mode dependent keywords and function-name identifiers (C vs. C++). */
#if defined(__cplusplus)
# define LIBXSMM_VARIADIC ...
# define LIBXSMM_EXTERN extern "C"
# define LIBXSMM_EXTERN_C extern "C"
# define LIBXSMM_INLINE_KEYWORD inline
# define LIBXSMM_INLINE LIBXSMM_INLINE_KEYWORD
# if defined(__GNUC__) || defined(_CRAYC)
# define LIBXSMM_CALLER __PRETTY_FUNCTION__
# elif defined(_MSC_VER)
# define LIBXSMM_CALLER __FUNCDNAME__
# define LIBXSMM_FUNCNAME __FUNCTION__
# else
# define LIBXSMM_CALLER __FUNCNAME__
# endif
#else
/* C */
# define LIBXSMM_VARIADIC
# define LIBXSMM_EXTERN extern
# define LIBXSMM_EXTERN_C
# if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__)
/*C99*/
# define LIBXSMM_PRAGMA(DIRECTIVE) _Pragma(LIBXSMM_STRINGIFY(DIRECTIVE))
# define LIBXSMM_CALLER __func__
# define LIBXSMM_RESTRICT restrict
# define LIBXSMM_INLINE_KEYWORD inline
# elif defined(_MSC_VER)
# define LIBXSMM_CALLER __FUNCDNAME__
# define LIBXSMM_FUNCNAME __FUNCTION__
# define LIBXSMM_INLINE_KEYWORD __inline
# define LIBXSMM_INLINE_FIXUP
# elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
# define LIBXSMM_CALLER __PRETTY_FUNCTION__
# endif
# if !defined(LIBXSMM_INLINE_KEYWORD)
# define LIBXSMM_INLINE_KEYWORD
# define LIBXSMM_INLINE_FIXUP
# endif
/* LIBXSMM_ATTRIBUTE_USED: increases compile-time of header-only by a large factor */
# define LIBXSMM_INLINE static LIBXSMM_INLINE_KEYWORD LIBXSMM_ATTRIBUTE_UNUSED
#endif
/*__cplusplus*/
#if !defined(LIBXSMM_CALLER)
# define LIBXSMM_CALLER NULL
#endif
#if !defined(LIBXSMM_FUNCNAME)
# define LIBXSMM_FUNCNAME LIBXSMM_CALLER
#endif
#if !defined(LIBXSMM_CALLER_ID)
# if defined(__GNUC__) || 1
# define LIBXSMM_CALLER_ID ((const void*)((uintptr_t)libxsmm_hash_string(LIBXSMM_CALLER)))
# else
/* assume no string-pooling (perhaps unsafe) */
# define LIBXSMM_CALLER_ID LIBXSMM_CALLER
# endif
#endif
/* Intel offload (MIC) decorations; no-ops when offload builds are disabled. */
#if defined(LIBXSMM_OFFLOAD_BUILD) && \
defined(__INTEL_OFFLOAD) && (!defined(_WIN32) || (1400 <= LIBXSMM_INTEL_COMPILER))
# define LIBXSMM_OFFLOAD(A) LIBXSMM_ATTRIBUTE(target(A))
# define LIBXSMM_NO_OFFLOAD(RTYPE, FN, ...) ((RTYPE (*)(LIBXSMM_VARIADIC))(FN))(__VA_ARGS__)
# if !defined(LIBXSMM_OFFLOAD_TARGET)
# define LIBXSMM_OFFLOAD_TARGET mic
# endif
#else
# define LIBXSMM_OFFLOAD(A)
# define LIBXSMM_NO_OFFLOAD(RTYPE, FN, ...) (FN)(__VA_ARGS__)
#endif
#define LIBXSMM_RETARGETABLE LIBXSMM_OFFLOAD(LIBXSMM_OFFLOAD_TARGET)
#if !defined(__STATIC) && !defined(_WINDLL) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__))
# define __STATIC
#endif
/* may include Clang and other compatible compilers */
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__)
# define LIBXSMM_VISIBILITY_INTERNAL LIBXSMM_ATTRIBUTE(visibility("internal"))
# define LIBXSMM_VISIBILITY_HIDDEN LIBXSMM_ATTRIBUTE(visibility("hidden"))
# define LIBXSMM_VISIBILITY_PUBLIC LIBXSMM_ATTRIBUTE(visibility("default"))
#endif
/* Fallbacks when symbol-visibility attributes are unsupported. */
#if !defined(LIBXSMM_VISIBILITY_INTERNAL)
# define LIBXSMM_VISIBILITY_INTERNAL
#endif
#if !defined(LIBXSMM_VISIBILITY_HIDDEN)
# define LIBXSMM_VISIBILITY_HIDDEN
#endif
#if !defined(LIBXSMM_VISIBILITY_PUBLIC)
# define LIBXSMM_VISIBILITY_PUBLIC
#endif
#if !defined(LIBXSMM_VISIBILITY_PRIVATE)
# define LIBXSMM_VISIBILITY_PRIVATE LIBXSMM_VISIBILITY_HIDDEN
#endif
/* Windows Dynamic Link Library (DLL) */
#if !defined(__STATIC) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__))
# define LIBXSMM_VISIBILITY_EXPORT LIBXSMM_ATTRIBUTE(dllexport)
# define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_ATTRIBUTE(dllimport)
#endif
#if !defined(LIBXSMM_VISIBILITY_EXPORT)
# define LIBXSMM_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_PUBLIC
#endif
#if !defined(LIBXSMM_VISIBILITY_IMPORT)
# define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_PUBLIC
#endif
#if defined(LIBXSMM_SOURCE_H)
/* header-only mode */
# define LIBXSMM_API_VISIBILITY_EXPORT
# define LIBXSMM_API_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_INTERN
# define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE LIBXSMM_ATTRIBUTE_COMMON
# define LIBXSMM_API_TARGET LIBXSMM_API_INLINE
# define LIBXSMM_API_EXTERN LIBXSMM_EXTERN_C
#else
/* classic ABI */
# if defined(LIBXSMM_BUILD_EXT)
# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_EXPORT
# define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE
# elif defined(LIBXSMM_BUILD)
# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_EXPORT
# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE
# else
/* import */
# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT
# define LIBXSMM_API_VISIBILITY_INTERN
# endif
# define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE
# define LIBXSMM_API_TARGET LIBXSMM_RETARGETABLE
# define LIBXSMM_API_EXTERN LIBXSMM_EXTERN
#endif
#define LIBXSMM_API_VISIBILITY(VISIBILITY) LIBXSMM_CONCATENATE(LIBXSMM_API_VISIBILITY_, VISIBILITY)
#define LIBXSMM_APIVAR(DECL, VISIBILITY, EXTERN) EXTERN LIBXSMM_API_COMMON LIBXSMM_API_VISIBILITY(VISIBILITY) DECL
#define LIBXSMM_API_INLINE LIBXSMM_INLINE LIBXSMM_RETARGETABLE
#define LIBXSMM_API_DEF
#if (!defined(__INTEL_COMPILER) || !defined(_WIN32))
#define LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_ALIGNED(LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF), LIBXSMM_CONFIG_CACHELINE)
#else
#define LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF)
#endif
/** Public variable declaration (without definition) located in header file. */
#define LIBXSMM_APIVAR_PUBLIC(DECL) LIBXSMM_APIVAR(DECL, EXPORT, LIBXSMM_API_EXTERN)
/** Public variable definition (complements declaration) located in source file. */
#define LIBXSMM_APIVAR_PUBLIC_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, EXPORT)
/** Private variable declaration (without definition) located in header file. */
#define LIBXSMM_APIVAR_PRIVATE(DECL) LIBXSMM_APIVAR(DECL, INTERN, LIBXSMM_API_EXTERN)
/** Private variable definition (complements declaration) located in source file. */
#define LIBXSMM_APIVAR_PRIVATE_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, INTERN)
/** Private variable (declaration and definition) located in source file. */
#define LIBXSMM_APIVAR_DEFINE(DECL) LIBXSMM_APIVAR_PRIVATE(DECL); LIBXSMM_APIVAR_PRIVATE_DEF(DECL)
/** Function decoration used for private functions. */
#define LIBXSMM_API_INTERN LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(INTERN)
/** Function decoration used for public functions of LIBXSMMext library. */
#define LIBXSMM_APIEXT LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(IMPORT)
/** Function decoration used for public functions of LIBXSMM library. */
#define LIBXSMM_API LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(EXPORT)
#if !defined(LIBXSMM_RESTRICT)
# if ((defined(__GNUC__) && !defined(__CYGWIN32__)) || defined(LIBXSMM_INTEL_COMPILER)) && !defined(_WIN32)
# define LIBXSMM_RESTRICT __restrict__
# elif defined(_MSC_VER) || defined(LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_RESTRICT __restrict
# else
# define LIBXSMM_RESTRICT
# endif
#endif
/*LIBXSMM_RESTRICT*/
#if !defined(LIBXSMM_PRAGMA)
# if defined(LIBXSMM_INTEL_COMPILER) || defined(_MSC_VER)
# define LIBXSMM_PRAGMA(DIRECTIVE) __pragma(LIBXSMM_EXPAND(DIRECTIVE))
# else
# define LIBXSMM_PRAGMA(DIRECTIVE)
# endif
#endif
/*LIBXSMM_PRAGMA*/
/* Detect support for OpenMP 4.0 "#pragma omp simd" per compiler/version. */
#if !defined(LIBXSMM_OPENMP_SIMD) && (defined(_OPENMP) && (201307 <= _OPENMP/*v4.0*/))
# if defined(LIBXSMM_INTEL_COMPILER)
# if (1500 <= LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_OPENMP_SIMD
# endif
# elif defined(__GNUC__)
# if LIBXSMM_VERSION2(4, 9) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)
# define LIBXSMM_OPENMP_SIMD
# endif
# else
# define LIBXSMM_OPENMP_SIMD
# endif
#endif
/* SIMD pragmas: prefer OpenMP simd, fall back to Intel's legacy "#pragma simd". */
#if !defined(LIBXSMM_INTEL_COMPILER) || (LIBXSMM_INTEL_COMPILER < 9900)
# if defined(LIBXSMM_OPENMP_SIMD)
# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(omp simd reduction(EXPRESSION))
# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(omp simd collapse(N))
# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(omp simd private(__VA_ARGS__))
# define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(omp simd)
# elif defined(__INTEL_COMPILER)
# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(simd reduction(EXPRESSION))
# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(simd collapse(N))
# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(simd private(__VA_ARGS__))
# define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(simd)
# endif
#endif
/* Empty fallbacks when no SIMD pragma flavor was selected above. */
#if !defined(LIBXSMM_PRAGMA_SIMD)
# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION)
# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N)
# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...)
# define LIBXSMM_PRAGMA_SIMD
#endif
/* Vectorization/unrolling hints per compiler (no-ops where unsupported). */
#if defined(__INTEL_COMPILER)
# define LIBXSMM_PRAGMA_NONTEMPORAL(...) LIBXSMM_PRAGMA(vector nontemporal(__VA_ARGS__))
# define LIBXSMM_PRAGMA_VALIGNED LIBXSMM_PRAGMA(vector aligned)
# define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(novector)
# define LIBXSMM_PRAGMA_FORCEINLINE LIBXSMM_PRAGMA(forceinline)
# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(loop_count min=MIN max=MAX avg=AVG)
# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll_and_jam(N))
# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N))
# define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA(unroll)
# define LIBXSMM_PRAGMA_VALIGNED_VAR(A) LIBXSMM_ASSUME_ALIGNED(A, LIBXSMM_ALIGNMENT);
/*# define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE))*/
#else
# if defined(LIBXSMM_OPENMP_SIMD) && (201811 <= _OPENMP/*v5.0*/)
# define LIBXSMM_PRAGMA_NONTEMPORAL(...) LIBXSMM_PRAGMA(omp simd nontemporal(__VA_ARGS__))
# else
# define LIBXSMM_PRAGMA_NONTEMPORAL(...)
# endif
# if defined(__clang__)
# define LIBXSMM_PRAGMA_VALIGNED_VAR(A)
# define LIBXSMM_PRAGMA_VALIGNED
# define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(clang loop vectorize(disable))
# define LIBXSMM_PRAGMA_FORCEINLINE
# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(unroll(AVG))
# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll(N))
# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N))
# define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA_UNROLL_N(4)
# else
# define LIBXSMM_PRAGMA_VALIGNED_VAR(A)
# define LIBXSMM_PRAGMA_VALIGNED
# define LIBXSMM_PRAGMA_NOVECTOR
# define LIBXSMM_PRAGMA_FORCEINLINE
# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG)
# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N)
# define LIBXSMM_PRAGMA_UNROLL
# endif
#endif
#if !defined(LIBXSMM_PRAGMA_UNROLL_N)
# if defined(__GNUC__) && (LIBXSMM_VERSION2(8, 3) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))
# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(GCC unroll N)
# else
# define LIBXSMM_PRAGMA_UNROLL_N(N)
# endif
#endif
/* Bracket a region with optimization disabled/re-enabled (compiler-specific). */
#if defined(LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(optimize("", off))
# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(optimize("", on))
#elif defined(__clang__)
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(clang optimize off)
# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(clang optimize on)
#elif defined(__GNUC__)
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(GCC push_options) LIBXSMM_PRAGMA(GCC optimize("O0"))
# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(GCC pop_options)
#else
# define LIBXSMM_PRAGMA_OPTIMIZE_OFF
# define LIBXSMM_PRAGMA_OPTIMIZE_ON
#endif
/* OpenMP 3.0 loop-collapse clause (only for optimized/NDEBUG builds). */
#if defined(_OPENMP) && (200805 <= _OPENMP/*v3.0*/) \
 && defined(NDEBUG) /* CCE complains for debug builds */
# define LIBXSMM_OPENMP_COLLAPSE(N) collapse(N)
#else
# define LIBXSMM_OPENMP_COLLAPSE(N)
#endif
/** LIBXSMM_UP2POT rounds up to the next power of two (POT). */
/* NOTE: these are textual macros; arguments may be evaluated multiple times,
 * so avoid side effects in the arguments. */
#define LIBXSMM_UP2POT_01(N) ((N) | ((N) >> 1))
#define LIBXSMM_UP2POT_02(N) (LIBXSMM_UP2POT_01(N) | (LIBXSMM_UP2POT_01(N) >> 2))
#define LIBXSMM_UP2POT_04(N) (LIBXSMM_UP2POT_02(N) | (LIBXSMM_UP2POT_02(N) >> 4))
#define LIBXSMM_UP2POT_08(N) (LIBXSMM_UP2POT_04(N) | (LIBXSMM_UP2POT_04(N) >> 8))
#define LIBXSMM_UP2POT_16(N) (LIBXSMM_UP2POT_08(N) | (LIBXSMM_UP2POT_08(N) >> 16))
#define LIBXSMM_UP2POT_32(N) (LIBXSMM_UP2POT_16(N) | (LIBXSMM_UP2POT_16(N) >> 32))
#define LIBXSMM_UP2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) - LIBXSMM_MIN(1, N)) + LIBXSMM_MIN(1, N))
#define LIBXSMM_LO2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) >> 1) + LIBXSMM_MIN(1, N))
/* Round N up to a multiple of MULT (UP2 requires a power-of-two NPOT). */
#define LIBXSMM_UPDIV(N, MULT) (((N) + ((MULT) - 1)) / (MULT))
#define LIBXSMM_UP(N, MULT) (LIBXSMM_UPDIV(N, MULT) * (MULT))
#define LIBXSMM_UP2(N, NPOT) (((N) + ((NPOT) - 1)) & ~((NPOT) - 1))
#define LIBXSMM_ABS(A) (0 <= (A) ? (A) : -(A))
#define LIBXSMM_MIN(A, B) ((A) < (B) ? (A) : (B))
#define LIBXSMM_MAX(A, B) ((A) < (B) ? (B) : (A))
#define LIBXSMM_MOD(A, N) ((A) % (N))
#define LIBXSMM_MOD2(A, NPOT) ((A) & ((NPOT) - 1))
#define LIBXSMM_DELTA(T0, T1) ((T0) < (T1) ? ((T1) - (T0)) : ((T0) - (T1)))
#define LIBXSMM_CLMP(VALUE, LO, HI) ((LO) < (VALUE) ? ((VALUE) <= (HI) ? (VALUE) : LIBXSMM_MIN(VALUE, HI)) : LIBXSMM_MAX(LO, VALUE))
#define LIBXSMM_SIZEOF(START, LAST) (((const char*)(LAST)) - ((const char*)(START)) + sizeof(*LAST))
#define LIBXSMM_FEQ(A, B) ((A) == (B))
#define LIBXSMM_NEQ(A, B) ((A) != (B))
#define LIBXSMM_ISPOT(A) (0 != (A) && !((A) & ((A) - 1)))
#define LIBXSMM_ISWAP(A, B) (((A) ^= (B)), ((B) ^= (A)), ((A) ^= (B)))
/* NaN is the only value that compares unequal to itself (IEEE-754). */
#define LIBXSMM_ISNAN(A) LIBXSMM_NEQ(A, A)
#define LIBXSMM_NOTNAN(A) LIBXSMM_FEQ(A, A)
#define LIBXSMM_ROUNDX(TYPE, A) ((TYPE)((long long)(0 <= (A) ? ((double)(A) + 0.5) : ((double)(A) - 0.5))))
#define LIBXSMM_CONST_VOID_PTR(A) *((const void**)&(A))
/** Makes some functions available independent of C99 support. */
#if defined(__STDC_VERSION__) && (199901L/*C99*/ <= __STDC_VERSION__)
# if defined(__PGI)
# define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B)))
# else
# define LIBXSMM_POWF(A, B) powf(A, B)
# endif
# define LIBXSMM_FREXPF(A, B) frexpf(A, B)
# define LIBXSMM_ROUNDF(A) roundf(A)
# define LIBXSMM_ROUND(A) round(A)
# define LIBXSMM_TANHF(A) tanhf(A)
# define LIBXSMM_SQRTF(A) sqrtf(A)
# define LIBXSMM_EXP2F(A) exp2f(A)
# define LIBXSMM_LOG2F(A) log2f(A)
# define LIBXSMM_ERFF(A) erff(A)
# define LIBXSMM_EXP2(A) exp2(A)
# define LIBXSMM_LOG2(A) log2(A)
# define LIBXSMM_EXPF(A) expf(A)
# define LIBXSMM_LOGF(A) logf(A)
#else /* pre-C99: emulate via the double-precision functions */
# define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B)))
# define LIBXSMM_FREXPF(A, B) ((float)frexp((float)(A), B))
# define LIBXSMM_ROUNDF(A) LIBXSMM_ROUNDX(float, A)
# define LIBXSMM_ROUND(A) LIBXSMM_ROUNDX(double, A)
# define LIBXSMM_TANHF(A) ((float)tanh((float)(A)))
# define LIBXSMM_SQRTF(A) ((float)sqrt((float)(A)))
# define LIBXSMM_EXP2F(A) LIBXSMM_POWF(2, A)
# define LIBXSMM_LOG2F(A) ((float)LIBXSMM_LOG2((float)(A)))
# define LIBXSMM_ERFF(A) ((float)erf((float)(A)))
# define LIBXSMM_EXP2(A) pow(2.0, A)
# define LIBXSMM_LOG2(A) (log(A) * (1.0 / (M_LN2)))
# define LIBXSMM_EXPF(A) ((float)exp((float)(A)))
# define LIBXSMM_LOGF(A) ((float)log((float)(A)))
#endif
/* LIBXSMM_ASSUME(EXPRESSION): promise the optimizer that EXPRESSION holds.
 * Where no compiler intrinsic exists, it degrades to a plain assert. */
#if defined(LIBXSMM_INTEL_COMPILER)
# if (1700 <= LIBXSMM_INTEL_COMPILER)
# define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION)
# else
# define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION)
# endif
#elif defined(_MSC_VER)
# define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION)
#elif defined(__GNUC__) && !defined(_CRAYC) && (LIBXSMM_VERSION2(4, 5) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))
/* GCC >= 4.5: a false EXPRESSION makes the branch unreachable (undefined if violated!). */
# define LIBXSMM_ASSUME(EXPRESSION) do { if (!(EXPRESSION)) __builtin_unreachable(); } while(0)
#else
# define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION)
#endif
/* LIBXSMM_ASSUME_ALIGNED(A, N): declare pointer A as N-Byte aligned (Intel intrinsic),
 * otherwise merely assert the alignment at runtime in debug builds. */
#if defined(__INTEL_COMPILER)
# define LIBXSMM_ASSUME_ALIGNED(A, N) __assume_aligned(A, N)
#else
# define LIBXSMM_ASSUME_ALIGNED(A, N) assert(0 == ((uintptr_t)(A)) % (N))
#endif
/* Advance a typed POINTER to the next ALIGNMENT-Byte boundary; ALIGNMENT must be a power-of-two. */
#define LIBXSMM_ALIGN(POINTER, ALIGNMENT/*POT*/) ((POINTER) + (LIBXSMM_UP2((uintptr_t)(POINTER), ALIGNMENT) - ((uintptr_t)(POINTER))) / sizeof(*(POINTER)))
/* Fold a pointer into [0, NPOT): position at ALIGNMENT granularity, modulo NPOT (POT). */
#define LIBXSMM_FOLD2(POINTER, ALIGNMENT, NPOT) LIBXSMM_MOD2(((uintptr_t)(POINTER) / (ALIGNMENT)), NPOT)
/* LIBXSMM_SELECT_ELEMENT(INDEX1, ...): pick the INDEX1-th (one-based) of up to ten arguments. */
#if defined(_MSC_VER) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER)
/* account for incorrect handling of __VA_ARGS__ */
# define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)LIBXSMM_EXPAND((__VA_ARGS__))
#else
# define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)(__VA_ARGS__)
#endif
/* One helper per selectable position (E0..E9). */
#define LIBXSMM_SELECT_ELEMENT_1(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E0
#define LIBXSMM_SELECT_ELEMENT_2(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E1
#define LIBXSMM_SELECT_ELEMENT_3(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E2
#define LIBXSMM_SELECT_ELEMENT_4(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E3
#define LIBXSMM_SELECT_ELEMENT_5(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E4
#define LIBXSMM_SELECT_ELEMENT_6(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E5
#define LIBXSMM_SELECT_ELEMENT_7(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E6
#define LIBXSMM_SELECT_ELEMENT_8(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E7
#define LIBXSMM_SELECT_ELEMENT_9(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E8
#define LIBXSMM_SELECT_ELEMENT_10(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E9
/* First argument of a variadic list (parenthesized); the trailing 0 keeps the AUX-call well-formed. */
#define LIBXSMM_SELECT_HEAD_AUX(A, ...) (A)
#define LIBXSMM_SELECT_HEAD(...) LIBXSMM_EXPAND(LIBXSMM_SELECT_HEAD_AUX(__VA_ARGS__, 0/*dummy*/))
/* All arguments except the first. */
#define LIBXSMM_SELECT_TAIL(A, ...) __VA_ARGS__
/**
 * For VLAs, check EXACTLY for C99 since a C11-conforming compiler may not provide VLAs.
 * However, some compilers (Intel) may signal support for VLA even with strict ANSI (C89).
 * To ultimately disable VLA-support, define LIBXSMM_NO_VLA (make VLA=0).
 * VLA-support is signaled by LIBXSMM_VLA.
 */
#if !defined(LIBXSMM_VLA) && !defined(LIBXSMM_NO_VLA) && !defined(__PGI) && ( \
  (defined(__STDC_VERSION__) && (199901L/*C99*/ == __STDC_VERSION__ || (!defined(__STDC_NO_VLA__) && 199901L/*C99*/ < __STDC_VERSION__))) || \
  (defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && !defined(__STRICT_ANSI__) && !defined(__cplusplus)) || \
  (defined(LIBXSMM_INTEL_COMPILER) && !defined(_WIN32) && !defined(__cplusplus)) || \
  (defined(__INTEL_COMPILER) && !defined(_WIN32)))
# define LIBXSMM_VLA
#endif
/**
 * LIBXSMM_INDEX1 calculates the linear address for a given set of (multiple) indexes/bounds.
 * Syntax: LIBXSMM_INDEX1(<ndims>, <i0>, ..., <i(ndims-1)>, <s1>, ..., <s(ndims-1)>).
 * Please note that the leading dimension (s0) is omitted in the above syntax!
 * TODO: support leading dimension (pitch/stride).
 */
#if defined(_MSC_VER) && !defined(__clang__)
/* account for incorrect handling of __VA_ARGS__ */
# define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)LIBXSMM_EXPAND((__VA_ARGS__))
#else
# define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)(__VA_ARGS__)
#endif
/* Per-rank expansion in size_t arithmetic: index = ((i0*s1 + i1)*s2 + i2)*... (row-major). */
#define LIBXSMM_INDEX1_1(...) ((size_t)LIBXSMM_SELECT_HEAD(__VA_ARGS__))
#define LIBXSMM_INDEX1_2(I0, I1, S1) (LIBXSMM_INDEX1_1(I0) * ((size_t)S1) + (size_t)I1)
#define LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) (LIBXSMM_INDEX1_2(I0, I1, S1) * ((size_t)S2) + (size_t)I2)
#define LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) (LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) * ((size_t)S3) + (size_t)I3)
#define LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) (LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) * ((size_t)S4) + (size_t)I4)
#define LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) (LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) * ((size_t)S5) + (size_t)I5)
#define LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) (LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) * ((size_t)S6) + (size_t)I6)
#define LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) (LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) * ((size_t)S7) + (size_t)I7)
#define LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) (LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) * ((size_t)S8) + (size_t)I8)
#define LIBXSMM_INDEX1_10(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, S9) (LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) * ((size_t)S9) + (size_t)I9)
/**
 * LIBXSMM_VLA_DECL declares an array according to the given set of (multiple) bounds.
 * Syntax: LIBXSMM_VLA_DECL(<ndims>, <elem-type>, <var-name>, <init>, <s1>, ..., <s(ndims-1)>).
 * The element type can be "const" or otherwise qualified; initial value must be (const)element-type*.
 * Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted!
 *
 * LIBXSMM_VLA_ACCESS gives the array element according to the given set of (multiple) indexes/bounds.
 * Syntax: LIBXSMM_VLA_ACCESS(<ndims>, <array>, <i0>, ..., <i(ndims-1)>, <s1>, ..., <s(ndims-1)>).
 * Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted!
 */
/* Suffix appended to the declared variable name; DECL and ACCESS must agree on it. */
#if !defined(LIBXSMM_VLA_POSTFIX)
# define LIBXSMM_VLA_POSTFIX _
#endif
#if defined(LIBXSMM_VLA)
/* Identity pass-through for an int. Presumably used to present a (possibly
 * compile-time constant) index as a runtime value -- see its sole use in
 * LIBXSMM_VLA_ACCESS_NONCONST; behavior is a plain return of the argument. */
LIBXSMM_API_INLINE int libxsmm_nonconst_int(int i) { return i; }
/* True-VLA implementation: ACCESS expands to nested [] subscripts; the XY "sink"
 * consumes the trailing bound arguments so they still appear (and are evaluated) once. */
# define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_VLA_ACCESS_ND(NDIMS, LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX), LIBXSMM_VLA_ACCESS_SINK, __VA_ARGS__)
# define LIBXSMM_VLA_ACCESS_SINK(S) + 0 * (S)
# define LIBXSMM_VLA_ACCESS_NONCONST(I) libxsmm_nonconst_int(I)
# define LIBXSMM_VLA_ACCESS_ND(NDIMS, ARRAY, XY, ...) LIBXSMM_CONCATENATE3(LIBXSMM_VLA_ACCESS_, NDIMS, D)(ARRAY, XY, __VA_ARGS__)
# define LIBXSMM_VLA_ACCESS_0D(ARRAY, XY, ...) (ARRAY) /*scalar*/
# define LIBXSMM_VLA_ACCESS_1D(ARRAY, XY, ...) ((ARRAY)[LIBXSMM_VLA_ACCESS_NONCONST(LIBXSMM_SELECT_HEAD(__VA_ARGS__))])
# define LIBXSMM_VLA_ACCESS_2D(ARRAY, XY, I0, I1, ...) (((ARRAY) XY(__VA_ARGS__))[I0][LIBXSMM_VLA_ACCESS_NONCONST(I1)])
# define LIBXSMM_VLA_ACCESS_3D(ARRAY, XY, I0, I1, I2, S1, ...) (((ARRAY) XY(S1) XY(__VA_ARGS__))[I0][I1][LIBXSMM_VLA_ACCESS_NONCONST(I2)])
# define LIBXSMM_VLA_ACCESS_4D(ARRAY, XY, I0, I1, I2, I3, S1, S2, ...) (((ARRAY) XY(S1) XY(S2) XY(__VA_ARGS__))[I0][I1][I2][LIBXSMM_VLA_ACCESS_NONCONST(I3)])
# define LIBXSMM_VLA_ACCESS_5D(ARRAY, XY, I0, I1, I2, I3, I4, S1, S2, S3, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(__VA_ARGS__))[I0][I1][I2][I3][LIBXSMM_VLA_ACCESS_NONCONST(I4)])
# define LIBXSMM_VLA_ACCESS_6D(ARRAY, XY, I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][LIBXSMM_VLA_ACCESS_NONCONST(I5)])
# define LIBXSMM_VLA_ACCESS_7D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][LIBXSMM_VLA_ACCESS_NONCONST(I6)])
# define LIBXSMM_VLA_ACCESS_8D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][LIBXSMM_VLA_ACCESS_NONCONST(I7)])
# define LIBXSMM_VLA_ACCESS_9D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][LIBXSMM_VLA_ACCESS_NONCONST(I8)])
# define LIBXSMM_VLA_ACCESS_10D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(S8) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][I8][LIBXSMM_VLA_ACCESS_NONCONST(I9)])
/* Declare a VLA-typed pointer (name decorated with LIBXSMM_VLA_POSTFIX) and cast the
 * initial value to the matching VLA pointer type; LIBXSMM_ELIDE drops the subscript sink. */
# define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \
    ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX), \
      LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/) = \
    (ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *, \
      LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/))LIBXSMM_SELECT_HEAD(__VA_ARGS__)
#else
/* calculate linear index */
# define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX)[LIBXSMM_INDEX1(NDIMS, __VA_ARGS__)]
# define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \
    ELEMENT_TYPE *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX) = /*(ELEMENT_TYPE*)*/LIBXSMM_SELECT_HEAD(__VA_ARGS__) \
    + 0 * LIBXSMM_INDEX1(NDIMS, LIBXSMM_SELECT_TAIL(__VA_ARGS__, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)))
  /* dummy-shift to "sink" unused arguments */
#endif
/** Access an array of TYPE with Byte-measured stride. */
#define LIBXSMM_ACCESS(TYPE, ARRAY, STRIDE) (*(TYPE*)((char*)(ARRAY) + (STRIDE)))
#if !defined(LIBXSMM_UNUSED)
# if 0
# define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE))
# else
# define LIBXSMM_UNUSED(VARIABLE) (void)(VARIABLE)
# endif
#endif
#if !defined(NDEBUG)
# define LIBXSMM_UNUSED_DEBUG(VARIABLE) LIBXSMM_UNUSED(VARIABLE)
#else
# define LIBXSMM_UNUSED_DEBUG(VARIABLE)
#endif
#if defined(_OPENMP)
# define LIBXSMM_PRAGMA_OMP(...) LIBXSMM_PRAGMA(omp __VA_ARGS__)
# if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
# define LIBXSMM_OMP_VAR(A) LIBXSMM_UNUSED(A)
/* suppress warning about "unused" variable */
# elif defined(__clang__)
# define LIBXSMM_OMP_VAR(A) (A) = 0
# else
# define LIBXSMM_OMP_VAR(A)
# endif
#else
# define LIBXSMM_PRAGMA_OMP(...)
# define LIBXSMM_OMP_VAR(A)
#endif
/* Weak-symbol attributes; not used under Cygwin/MinGW. */
#if defined(LIBXSMM_BUILD) && (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) && !defined(__MINGW32__)
# define LIBXSMM_ATTRIBUTE_WEAK_IMPORT LIBXSMM_ATTRIBUTE(weak_import)
# define LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE(weak)
#else
# define LIBXSMM_ATTRIBUTE_WEAK
# define LIBXSMM_ATTRIBUTE_WEAK_IMPORT
#endif
/* Constructor/destructor attributes: only for C99 GNU-compatible, non-static library builds. */
#if !defined(LIBXSMM_NO_CTOR) && !defined(LIBXSMM_CTOR) && \
    (defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__)) && \
    (defined(LIBXSMM_BUILD) && !defined(__STATIC)) && \
    (defined(__GNUC__) || defined(__clang__))
# define LIBXSMM_ATTRIBUTE_CTOR LIBXSMM_ATTRIBUTE(constructor)
# define LIBXSMM_ATTRIBUTE_DTOR LIBXSMM_ATTRIBUTE(destructor)
# define LIBXSMM_CTOR
#else
# define LIBXSMM_ATTRIBUTE_CTOR
# define LIBXSMM_ATTRIBUTE_DTOR
#endif
/* Exclude a function from -finstrument-functions style call tracing. */
#if defined(__GNUC__) && !defined(__PGI) && !defined(__ibmxl__)
# define LIBXSMM_ATTRIBUTE_NO_TRACE LIBXSMM_ATTRIBUTE(no_instrument_function)
#else
# define LIBXSMM_ATTRIBUTE_NO_TRACE
#endif
/* Types carrying this attribute may alias other types (GCC may_alias). */
#if defined(__GNUC__)
# define LIBXSMM_MAY_ALIAS LIBXSMM_ATTRIBUTE(__may_alias__)
#else
# define LIBXSMM_MAY_ALIAS
#endif
/* Template for mkstemp-style temporary file names. */
#if !defined(LIBXSMM_MKTEMP_PATTERN)
# define LIBXSMM_MKTEMP_PATTERN "XXXXXX"
#endif
/** Below group is to fix-up some platform/compiler specifics. */
#if defined(_WIN32)
# if !defined(_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES)
# define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
# endif
# if !defined(_CRT_SECURE_NO_DEPRECATE)
# define _CRT_SECURE_NO_DEPRECATE 1
# endif
# if !defined(_USE_MATH_DEFINES)
# define _USE_MATH_DEFINES 1
# endif
# if !defined(WIN32_LEAN_AND_MEAN)
# define WIN32_LEAN_AND_MEAN 1
# endif
# if !defined(NOMINMAX)
# define NOMINMAX 1
# endif
# if defined(__INTEL_COMPILER) && (190023506 <= _MSC_FULL_VER)
# define __builtin_huge_val() HUGE_VAL
# define __builtin_huge_valf() HUGE_VALF
# define __builtin_nan nan
# define __builtin_nanf nanf
# define __builtin_nans nan
# define __builtin_nansf nanf
# if defined(__cplusplus)
# define _CMATH_
# endif
# endif
#endif
#if !defined(_GNU_SOURCE) && defined(LIBXSMM_BUILD)
# define _GNU_SOURCE
#endif
#if !defined(__STDC_FORMAT_MACROS)
# define __STDC_FORMAT_MACROS
#endif
#if defined(__clang__) && !defined(__extern_always_inline)
# define __extern_always_inline LIBXSMM_INLINE
#endif
#if defined(LIBXSMM_INLINE_FIXUP) && !defined(inline)
# define inline LIBXSMM_INLINE_KEYWORD
#endif
#if (0 != LIBXSMM_SYNC)
# if !defined(_REENTRANT)
# define _REENTRANT
# endif
# if defined(__PGI)
# if defined(__GCC_ATOMIC_TEST_AND_SET_TRUEVAL)
# undef __GCC_ATOMIC_TEST_AND_SET_TRUEVAL
# endif
# define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
# endif
#endif
#if !defined(__has_feature) && !defined(__clang__)
# define __has_feature(A) 0
#endif
#if !defined(__has_builtin) && !defined(__clang__)
# define __has_builtin(A) 0
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
#if (0 != LIBXSMM_SYNC)
# if defined(_WIN32) || defined(__CYGWIN__)
# include <windows.h>
# else
# include <pthread.h>
# endif
#endif
/* LIBXSMM_ASSERT: assert in debug builds; becomes an optimizer hint (LIBXSMM_ASSUME) under NDEBUG. */
#if !defined(LIBXSMM_ASSERT)
# include <assert.h>
# if defined(NDEBUG)
# define LIBXSMM_ASSERT(EXPR) LIBXSMM_ASSUME(EXPR)
# else
# define LIBXSMM_ASSERT(EXPR) assert(EXPR)
# endif
#endif
/* Assert with a message; "&& *MSG" keeps MSG visible in the assertion text. */
#if !defined(LIBXSMM_ASSERT_MSG)
# define LIBXSMM_ASSERT_MSG(EXPR, MSG) assert((EXPR) && *MSG)
#endif
/* Evaluate EXPR exactly once and discard the comparison result (release-mode form of EXPECT). */
#if !defined(LIBXSMM_EXPECT_ELIDE)
# define LIBXSMM_EXPECT_ELIDE(RESULT, EXPR) do { \
    /*const*/ int libxsmm_expect_result_ = ((RESULT) == (EXPR)); \
    LIBXSMM_UNUSED(libxsmm_expect_result_); \
  } while(0)
#endif
/* LIBXSMM_EXPECT(_NOT): assert the (in)equality in debug builds, but still evaluate EXPR in release builds. */
#if defined(NDEBUG)
# define LIBXSMM_EXPECT LIBXSMM_EXPECT_ELIDE
# define LIBXSMM_EXPECT_NOT LIBXSMM_EXPECT_ELIDE
#else
# define LIBXSMM_EXPECT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) == (EXPR))
# define LIBXSMM_EXPECT_NOT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) != (EXPR))
#endif
/* Checked only in MSVC-style _DEBUG builds. */
#if defined(_DEBUG)
# define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT
#else
# define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT_ELIDE
#endif
#if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)
# include <omp.h>
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <float.h>
#include <stdio.h>
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
/* Fallbacks in case <float.h> does not provide FLT_MAX/FLT_MIN. */
#if !defined(FLT_MAX)
# if !defined(__FLT_MAX__)
# define FLT_MAX 3.40282346638528859811704183484516925e+38F
# else
# define FLT_MAX __FLT_MAX__
# endif
#endif
#if !defined(FLT_MIN)
# if !defined(__FLT_MIN__)
# define FLT_MIN 1.17549435082228750796873653722224568e-38F
# else
# define FLT_MIN __FLT_MIN__
# endif
#endif
/* Bounded formatted print; the _snprintf_s branch is deliberately disabled ("&& 0"). */
#if defined(_WIN32) && 0
# define LIBXSMM_SNPRINTF(S, N, ...) _snprintf_s(S, N, _TRUNCATE, __VA_ARGS__)
#elif defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__ || defined(__GNUC__))
# define LIBXSMM_SNPRINTF(S, N, ...) snprintf(S, N, __VA_ARGS__)
#else
/* NOTE(review): pre-C99 fallback degrades to unbounded sprintf (N is ignored); callers must size S generously. */
# define LIBXSMM_SNPRINTF(S, N, ...) sprintf((S) + /*unused*/(N) * 0, __VA_ARGS__)
#endif
/* Exception-specification helpers for C/C++ interoperability. */
#if defined(__THROW) && defined(__cplusplus)
# define LIBXSMM_THROW __THROW
#endif
#if !defined(LIBXSMM_THROW)
# define LIBXSMM_THROW
#endif
#if defined(__GNUC__) && LIBXSMM_VERSION2(4, 2) == LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && \
    !defined(__clang__) && !defined(__PGI) && !defined(__INTEL_COMPILER) && !defined(_CRAYC)
# define LIBXSMM_NOTHROW LIBXSMM_THROW
#else
# define LIBXSMM_NOTHROW
#endif
#if defined(__cplusplus)
# if (__cplusplus > 199711L)
# define LIBXSMM_NOEXCEPT noexcept
# else
# define LIBXSMM_NOEXCEPT throw()
# endif
#else
# define LIBXSMM_NOEXCEPT LIBXSMM_NOTHROW
#endif
/* putenv spelling differs on Windows. */
#if defined(_WIN32)
# define LIBXSMM_PUTENV(A) _putenv(A)
#else
# define LIBXSMM_PUTENV(A) putenv(A)
#endif
/* block must be after including above header files */
/* Work around missing ISO/IEC TS 18661 _FloatN types with old glibc (<2.26) or older Intel compilers. */
#if (defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__) < LIBXSMM_VERSION2(2, 26)) \
 || (defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER) && !defined(__cplusplus) && defined(__linux__))
/* _Float128 was introduced with GNU GCC 7.0. */
# if !defined(_Float128) && !defined(__SIZEOF_FLOAT128__) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__)
# define _Float128 __float128
# endif
# if !defined(LIBXSMM_GLIBC_FPTYPES) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__) \
  && (LIBXSMM_VERSION2(7, 0) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) || \
     (defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER)))
# define LIBXSMM_GLIBC_FPTYPES
# endif
/* Map each missing _FloatN/_FloatNx onto the closest available type. */
# if !defined(_Float128X) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float128X _Float128
# endif
# if !defined(_Float32) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float32 float
# endif
# if !defined(_Float32x) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float32x _Float32
# endif
# if !defined(_Float64) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float64 double
# endif
# if !defined(_Float64x) && defined(LIBXSMM_GLIBC_FPTYPES)
# define _Float64x _Float64
# endif
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
/* Header-selection fix-ups required by the _FloatN workaround above. */
#if defined(LIBXSMM_GLIBC_FPTYPES)
# if defined(__cplusplus)
# undef __USE_MISC
# if !defined(_DEFAULT_SOURCE)
# define _DEFAULT_SOURCE
# endif
# if !defined(_BSD_SOURCE)
# define _BSD_SOURCE
# endif
# else
# if !defined(__PURE_INTEL_C99_HEADERS__)
# define __PURE_INTEL_C99_HEADERS__
# endif
# endif
#endif
/* Math library: Intel's mathimf.h when available, otherwise standard math.h. */
#if !defined(LIBXSMM_NO_LIBM)
# if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= LIBXSMM_INTEL_COMPILER)) \
  && !defined(_WIN32) /* error including dfp754.h */
# include <mathimf.h>
# endif
# include <math.h>
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
#endif /*LIBXSMM_MACROS_H*/
third_party/libxsmm/include/libxsmm_malloc.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MALLOC_H
#define LIBXSMM_MALLOC_H

#include "libxsmm_memory.h"

/* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */
#if !defined(LIBXSMM_TF12) && (!defined(TF_VERSION_STRING) || \
  LIBXSMM_VERSION2(1, 12) <= LIBXSMM_VERSION2(TF_MAJOR_VERSION, TF_MINOR_VERSION))
# define LIBXSMM_TF12 /* TF_PATCH_VERSION does not matter */
#endif

/** Can be used with libxsmm_[get|set]_scratch_limit. */
#define LIBXSMM_SCRATCH_UNLIMITED ((size_t)LIBXSMM_UNLIMITED)
#define LIBXSMM_SCRATCH_DEFAULT 0
/** Function types accepted for memory allocation (see libxsmm_*_allocator). */
/* Context form: receives the requested size and an opaque user context. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_ctx)(size_t /*size*/, const void* /*context*/);
/* Plain form: malloc-compatible signature. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_fun)(size_t /*size*/);
/* Either form of allocation callback; which member is valid depends on whether a context was registered. */
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_malloc_function {
  libxsmm_malloc_ctx ctx_form;
  libxsmm_malloc_fun function;
} libxsmm_malloc_function;
/** Function types accepted for releasing memory (see libxsmm_*_allocator). */
/* Context form: receives the buffer and an opaque user context. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_ctx)(void* /*buffer*/, const void* /*context*/);
/* Plain form: free-compatible signature. */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_fun)(void* /*buffer*/);
/* Either form of deallocation callback (see libxsmm_malloc_function). */
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_free_function {
  libxsmm_free_ctx ctx_form;
  libxsmm_free_fun function;
} libxsmm_free_function;
/**
 * To setup the custom default memory allocator, either a malloc_fn and a free_fn
 * are given, or two NULL-pointers designate to reset the default allocator to a
 * library-internal default. If a context is given (non-NULL), the context-based
 * form of the memory allocation is used.
 * Changing the allocator including the function for deallocation applies to
 * upcoming allocation/deallocation and works correctly for pending buffers.
 * Returns a LIBXSMM status code.
 */
LIBXSMM_API int libxsmm_set_default_allocator(/* malloc_fn/free_fn must correspond */
  const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn);
/** Retrieve the default memory allocator (any out-argument receives the current setting). */
LIBXSMM_API int libxsmm_get_default_allocator(const void** context,
  libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn);
/**
 * To setup the scratch memory allocator, a malloc_fn function and an optional free_fn
 * are given. A NULL-free acts as a "no-operation", and the deallocation is expected
 * to be controlled otherwise. If two NULL-pointers are given, the allocator is reset
 * to the currently active default memory allocator. If a context is given (non-NULL),
 * the context-based form of the memory allocation is used.
 * Changing the allocator including the function for deallocation applies to
 * upcoming allocation/deallocation and works correctly for pending buffers.
 * Returns a LIBXSMM status code.
 */
LIBXSMM_API int libxsmm_set_scratch_allocator(/* malloc_fn/free_fn must correspond */
  const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn);
/** Retrieve the scratch memory allocator (any out-argument receives the current setting). */
LIBXSMM_API int libxsmm_get_scratch_allocator(const void** context,
  libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn);
/** Allocate memory (malloc/free interface). */
LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_malloc(size_t size);
/** Allocate aligned memory using the default allocator. */
LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_aligned_malloc(size_t size,
  /**
   * =0: align automatically according to the size
   * 0<: align according to the alignment value
   */
  size_t alignment);
/** Reallocate memory using the default allocator (alignment is preserved). */
LIBXSMM_API void* libxsmm_realloc(size_t size, void* ptr);
/**
 * Allocate aligned scratch memory. It is not supported
 * to query properties per libxsmm_get_malloc_info, but
 * libxsmm_get_scratch_info can be used instead.
 */
LIBXSMM_API void* libxsmm_scratch_malloc(size_t size,
  /**
   * =0: align automatically according to the size
   * 0<: align according to the alignment value
   */
  size_t alignment,
  /**
   * Identifies the call site, which is used
   * to determine the memory pool.
   */
  const void* caller);
/**
 * Binary form of libxsmm_scratch_malloc, which
 * expands the call-context automatically. This
 * macro is intentionally lower case.
 */
#define libxsmm_aligned_scratch(size, alignment) \
  libxsmm_scratch_malloc(size, alignment, LIBXSMM_CALLER_ID)
/** Deallocate memory (malloc/free interface). */
LIBXSMM_API void libxsmm_free(const void* memory);
/**
 * Release the entire scratch memory regardless
 * of whether it is still referenced or not.
 */
LIBXSMM_API void libxsmm_release_scratch(void);
/** Information about a buffer (default memory domain). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_malloc_info {
  /** Size of the buffer. */
  size_t size;
} libxsmm_malloc_info;
/** Retrieve information about a buffer (default memory domain); returns a LIBXSMM status code. */
LIBXSMM_API int libxsmm_get_malloc_info(const void* memory, libxsmm_malloc_info* info);
/** Information about the scratch memory domain. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_scratch_info {
  /** Watermark memory across pools (size), unsatisfied (local), and library-internal memory. */
  size_t size, local, internal;
  /** Pending allocations (not released). */
  size_t npending;
  /** Number of allocations so far. */
  size_t nmallocs;
  /** Number of pools used. */
  unsigned int npools;
} libxsmm_scratch_info;
/** Retrieve information about the scratch memory domain; returns a LIBXSMM status code. */
LIBXSMM_API int libxsmm_get_scratch_info(libxsmm_scratch_info* info);
/**
 * Limit the total size (Bytes) of the scratch memory.
 * LIBXSMM_SCRATCH_UNLIMITED removes any limit, and
 * LIBXSMM_SCRATCH_DEFAULT populates the default.
 * The related environment variable LIBXSMM_SCRATCH_LIMIT
 * allows units: <none>/b/B (Bytes), k/K, m/M, and g/G.
 */
LIBXSMM_API void libxsmm_set_scratch_limit(size_t nbytes);
/** Get the maximum size of the scratch memory domain. */
LIBXSMM_API size_t libxsmm_get_scratch_limit(void);
/**
 * Intercepts malloc/free to use scratch memory allocator.
 * (related environment variable LIBXSMM_MALLOC).
 * Optionally set the range of malloc-sizes to be intercepted.
 * The related environment variable LIBXSMM_MALLOC_LIMIT
 * allows units: <none>/b/B (Bytes), k/K, m/M, and g/G.
 */
LIBXSMM_API void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi);
/**
 * Determines if malloc/free are (and can be) intercepted.
 * Optionally gets the range of enabled malloc-sizes.
 */
LIBXSMM_API int libxsmm_get_malloc(size_t* lo, size_t* hi);
/**
 * Calculate the linear offset of the n-dimensional (ndims) offset (can be NULL),
 * and the (optional) linear size of the corresponding shape.
 */
LIBXSMM_API size_t libxsmm_offset(const size_t offset[], const size_t shape[], size_t ndims, size_t* size);
#if defined(__cplusplus)

/** RAII idiom to temporarily setup an allocator for the lifetime of the scope. */
template<typename kind> class LIBXSMM_RETARGETABLE libxsmm_scoped_allocator {
public:
  /** C'tor, which instantiates the new allocator (plain form). */
  libxsmm_scoped_allocator(libxsmm_malloc_fun malloc_fn, libxsmm_free_fun free_fn) {
    /* save the current allocator first, then install the new one */
    kind::get(m_context, m_malloc, m_free);
    kind::set(NULL/*context*/, NULL/*malloc_ctx*/, NULL/*free_ctx*/, malloc_fn, free_fn);
  }
  /** C'tor, which instantiates the new allocator (context form). */
  libxsmm_scoped_allocator(const void* context,
    libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx,
    libxsmm_malloc_fun malloc_fun = NULL, libxsmm_free_fun free_fun = NULL)
  {
    kind::get(m_context, m_malloc, m_free);
    kind::set(context, malloc_ctx, free_ctx, malloc_fun, free_fun);
  }
  /** Following the RAII idiom, the d'tor restores the previous allocator. */
  ~libxsmm_scoped_allocator() {
    /* both union members are passed back; kind::set picks the applicable form */
    kind::set(m_context,
      m_malloc.ctx_form, m_free.ctx_form,
      m_malloc.function, m_free.function);
  }
private: /* no copy/assignment */
  explicit libxsmm_scoped_allocator(const libxsmm_scoped_allocator&);
  libxsmm_scoped_allocator& operator=(const libxsmm_scoped_allocator&);
protected: /* saved/previous allocator */
  const void* m_context;
  libxsmm_malloc_function m_malloc;
  libxsmm_free_function m_free;
};
/** Allocator-kind to instantiate libxsmm_scoped_allocator<kind>. */
struct LIBXSMM_RETARGETABLE libxsmm_default_allocator {
  /** Install the given callbacks as LIBXSMM's default allocator. */
  static void set(const void* context,
    libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx,
    libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun)
  {
    libxsmm_malloc_function malloc_fn;
    libxsmm_free_function free_fn;
    if (NULL == context) { /* use global form only when no context is given */
      malloc_fn.function = malloc_fun;
      free_fn.function = free_fun;
    }
    else { /* context given: adopt the context-based callbacks */
      malloc_fn.ctx_form = malloc_ctx;
      free_fn.ctx_form = free_ctx;
    }
    libxsmm_set_default_allocator(context, malloc_fn, free_fn);
  }
  /** Retrieve the currently installed default allocator. */
  static void get(const void*& context,
    libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn)
  {
    libxsmm_get_default_allocator(&context, &malloc_fn, &free_fn);
  }
};
/** Allocator-kind to instantiate libxsmm_scoped_allocator<kind>. */
struct LIBXSMM_RETARGETABLE libxsmm_scratch_allocator {
  /**
   * Install the given callbacks as LIBXSMM's scratch-memory allocator.
   * FIX: the original assigned the plain-form callbacks when a context was
   * given and the context-form callbacks when it was not, contradicting both
   * its own branch comments and the parallel libxsmm_default_allocator::set;
   * as a result, with a non-NULL context the context-based callbacks
   * (malloc_ctx/free_ctx) were silently discarded. The branch bodies are
   * now consistent with the comments and with the default-allocator kind.
   */
  static void set(const void* context,
    libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx,
    libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun)
  {
    libxsmm_malloc_function malloc_fn;
    libxsmm_free_function free_fn;
    if (NULL != context) { /* adopt context form */
      malloc_fn.ctx_form = malloc_ctx;
      free_fn.ctx_form = free_ctx;
    }
    else { /* adopt global form */
      malloc_fn.function = malloc_fun;
      free_fn.function = free_fun;
    }
    libxsmm_set_scratch_allocator(context, malloc_fn, free_fn);
  }
  /** Retrieve the currently installed scratch-memory allocator. */
  static void get(const void*& context,
    libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn)
  {
    libxsmm_get_scratch_allocator(&context, &malloc_fn, &free_fn);
  }
};
/** Forward-declared types/functions used to implement libxsmm_tf_allocator. */
namespace
tensorflow
{
class
Allocator
;
#if defined(LIBXSMM_TF12)
class
DeviceBase
;
int
DeviceNumaNode
(
const
DeviceBase
*
/*device*/
);
Allocator
*
cpu_allocator
(
int
/*numa_node*/
);
#else
Allocator
*
cpu_allocator
();
#endif
}
/**
* An object of this type adopts a memory allocator from TensorFlow.
* All memory allocations of the requested kind within the current
* scope (where the libxsmm_tf_allocator object lives) are subject
* to TensorFlow's memory allocation scheme. The allocation kind
* is usually "libxsmm_scratch_allocator"; using a second object
* of kind "libxsmm_default_allocator" makes the default memory
* allocation of LIBXSMM subject to TensorFlow as well.
*/
template<typename kind>
class LIBXSMM_RETARGETABLE libxsmm_tf_allocator :
  public libxsmm_scoped_allocator<kind>
{
public:
  /** The TensorFlow allocator is adopted from the global CPU memory allocator. */
  explicit libxsmm_tf_allocator()
    : libxsmm_scoped_allocator<kind>(
        libxsmm_tf_allocator::malloc,
        libxsmm_tf_allocator::free)
  {}

  /** The TensorFlow allocator is adopted from the given OpKernelContext. */
  template<typename context_type>
  explicit libxsmm_tf_allocator(context_type& context)
    : libxsmm_scoped_allocator<kind>(&context,
        /* "template" keyword: malloc_ctx/free_ctx are dependent member templates */
        libxsmm_tf_allocator::template malloc_ctx<context_type>,
        libxsmm_tf_allocator::template free_ctx<context_type>,
        libxsmm_tf_allocator::malloc,
        libxsmm_tf_allocator::free)
  {}

  /** Global form of allocating memory (malloc signature). */
  static void* malloc(size_t size)
  {
#if defined(LIBXSMM_TF12)
    /* older TF API: cpu_allocator takes a NUMA node argument */
    return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), size);
#else
    return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(), size);
#endif
  }

  /** Global form of deallocating memory (free signature). */
  static void free(void* buffer)
  {
#if defined(LIBXSMM_TF12)
    libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), buffer);
#else
    libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(), buffer);
#endif
  }

  /** Context based form of allocating memory. */
  template<typename context_type>
  static void* malloc_ctx(const void* context, size_t size)
  {
    typedef typename context_type::WrappedAllocator::first_type allocator_ptr;
    context_type* const tf_context = static_cast<context_type*>(context);
    allocator_ptr allocator = NULL;
    if (NULL != tf_context) {
#if !defined(LIBXSMM_TF12)
      if (NULL != tf_context->device()) {
        /* prefer the step-allocator attributed to the first output, else the first input */
        if (0 < tf_context->num_outputs()) {
          allocator = tf_context->device()->GetStepAllocator(
            tf_context->output_alloc_attr(0),
            tf_context->resource_manager());
        }
        else if (0 < tf_context->num_inputs()) {
          allocator = tf_context->device()->GetStepAllocator(
            tf_context->input_alloc_attr(0),
            tf_context->resource_manager());
        }
      }
#else
      /* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */
      const int numa_node = DeviceNumaNode(tf_context->device());
      allocator = tensorflow::cpu_allocator(numa_node);
#endif
    }
    return libxsmm_tf_allocator::allocate(allocator, size);
  }

  /** Context based form of deallocating memory. */
  template<typename context_type>
  static void free_ctx(const void* context, void* buffer)
  {
    typedef typename context_type::WrappedAllocator::first_type allocator_ptr;
    context_type* const tf_context = static_cast<context_type*>(context);
    allocator_ptr allocator = NULL;
    if (NULL != tf_context) {
#if defined(LIBXSMM_TF12)
      const int numa_node = DeviceNumaNode(tf_context->device());
      allocator = tensorflow::cpu_allocator(numa_node);
#else
      /* mirror the allocator discovery performed in malloc_ctx */
      if (NULL != tf_context->device()) {
        if (0 < tf_context->num_outputs()) {
          allocator = tf_context->device()->GetStepAllocator(
            tf_context->output_alloc_attr(0),
            tf_context->resource_manager());
        }
        else if (0 < tf_context->num_inputs()) {
          allocator = tf_context->device()->GetStepAllocator(
            tf_context->input_alloc_attr(0),
            tf_context->resource_manager());
        }
      }
#endif
    }
    libxsmm_tf_allocator::deallocate(allocator, buffer);
  }

private:
  template<typename allocator_ptr> /* break interface dependency with TF */
  static void* allocate(allocator_ptr allocator, size_t size)
  {
    void* result;
    if (NULL != allocator) {
      /* no (useless) waste with alignment; raw result is re-aligned anyways */
      result = allocator->AllocateRaw(1/*alignment*/, size);
    }
    else {
      LIBXSMM_ASSERT_MSG(0/*false*/, "LIBXSMM ERROR: memory allocator is missing");
      result = NULL;
    }
    return result;
  }

  template<typename allocator_ptr> /* break interface dependency with TF */
  static void deallocate(allocator_ptr allocator, void* buffer)
  {
    LIBXSMM_ASSERT_MSG(NULL != allocator, "LIBXSMM ERROR: memory allocator is missing");
    if (NULL != allocator) allocator->DeallocateRaw(buffer);
  }
};
#endif
/*defined(__cplusplus)*/
#endif
/*LIBXSMM_MALLOC_H*/
third_party/libxsmm/include/libxsmm_math.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MATH_H
#define LIBXSMM_MATH_H
#include "libxsmm_typedefs.h"
/**
 * Structure of differences with matrix norms according
 * to http://www.netlib.org/lapack/lug/node75.html).
 */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_matdiff_info {
  /** One-norm (absolute and relative). */
  double norm1_abs, norm1_rel;
  /** Infinity-norm (absolute and relative). */
  double normi_abs, normi_rel;
  /** Frobenius-norm (relative). */
  double normf_rel;
  /** Maximum difference, L2-norm (absolute and relative), and R-squared. */
  double linf_abs, linf_rel, l2_abs, l2_rel, rsq;
  /** Statistics of the reference data: sum/l1, min., max., arith. avg., and variance. */
  double l1_ref, min_ref, max_ref, avg_ref, var_ref;
  /** Statistics of the test data: sum/l1, min., max., arith. avg., and variance. */
  double l1_tst, min_tst, max_tst, avg_tst, var_tst;
  /** Values (v_ref, v_tst) and location (m, n) of largest linf_abs. */
  double v_ref, v_tst;
  libxsmm_blasint m, n;
} libxsmm_matdiff_info;
/**
 * Utility function to calculate a collection of scalar differences between two matrices (libxsmm_matdiff_info).
 * The location (m, n) of the largest difference (linf_abs) is recorded (also in case of NaN). In case of NaN,
 * differences are set to infinity. If no difference is discovered, the location (m, n) is negative (OOB).
 */
LIBXSMM_API int libxsmm_matdiff(libxsmm_matdiff_info* info,
  libxsmm_datatype datatype, libxsmm_blasint m, libxsmm_blasint n,
  const void* ref, const void* tst,
  const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst);

/**
 * Reduces input into output such that the difference is maintained or increased (max function).
 * The very first (initial) output should be zeroed (libxsmm_matdiff_clear).
 */
LIBXSMM_API void libxsmm_matdiff_reduce(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input);

/** Clears the given info-structure, e.g., for the initial reduction-value (libxsmm_matdiff_reduce). */
LIBXSMM_API void libxsmm_matdiff_clear(libxsmm_matdiff_info* info);

/** Greatest common divisor (corner case: the GCD of 0 and 0 is 1). */
LIBXSMM_API size_t libxsmm_gcd(size_t a, size_t b);

/** Least common multiple. */
LIBXSMM_API size_t libxsmm_lcm(size_t a, size_t b);

/**
 * This function finds prime-factors (up to 32) of an unsigned integer in ascending order, and
 * returns the number of factors found (zero if the given number is prime and unequal to two).
 */
LIBXSMM_API int libxsmm_primes_u32(unsigned int num, unsigned int num_factors_n32[]);

/** Calculate co-prime number <= n/2 (except: libxsmm_shuffle(0|1) == 0). */
LIBXSMM_API size_t libxsmm_shuffle(unsigned int n);

/**
 * Divides the product into prime factors and selects factors such that the new product is within
 * the given limit (0/1-Knapsack problem), e.g., product=12=2*2*3 and limit=6 then result=2*3=6.
 * The limit is at least reached or exceeded with the minimal possible product (is_lower=true).
 */
LIBXSMM_API unsigned int libxsmm_product_limit(unsigned int product, unsigned int limit, int is_lower);

/* Kahan's summation returns accumulator += value and updates compensation. */
LIBXSMM_API double libxsmm_kahan_sum(double value, double* accumulator, double* compensation);

/** SQRT with Newton's method using integer arithmetic (64-bit input). */
LIBXSMM_API unsigned int libxsmm_isqrt_u64(unsigned long long x);

/** SQRT with Newton's method using integer arithmetic (32-bit input). */
LIBXSMM_API unsigned int libxsmm_isqrt_u32(unsigned int x);

/** Based on libxsmm_isqrt_u32, but actual factor of x. */
LIBXSMM_API unsigned int libxsmm_isqrt2_u32(unsigned int x);

/** SQRT with Newton's method using double-precision. */
LIBXSMM_API double libxsmm_dsqrt(double x);

/** SQRT with Newton's method using single-precision. */
LIBXSMM_API float libxsmm_ssqrt(float x);

/** CBRT with Newton's method using integer arithmetic (64-bit input). */
LIBXSMM_API unsigned int libxsmm_icbrt_u64(unsigned long long x);

/** CBRT with Newton's method using integer arithmetic (32-bit input). */
LIBXSMM_API unsigned int libxsmm_icbrt_u32(unsigned int x);

/** Single-precision approximation of exponential function (base 2). */
LIBXSMM_API float libxsmm_sexp2(float x);

/**
 * Exponential function (base 2), which is limited to unsigned 8-bit input values.
 * This function reproduces bit-accurate results (single-precision).
 */
LIBXSMM_API float libxsmm_sexp2_u8(unsigned char x);

/**
 * Exponential function (base 2), which is limited to signed 8-bit input values.
 * This function reproduces bit-accurate results (single-precision).
 */
LIBXSMM_API float libxsmm_sexp2_i8(signed char x);

/** Similar to libxsmm_sexp2_i8, but takes an integer as signed 8-bit value (check). */
LIBXSMM_API float libxsmm_sexp2_i8i(int x);
/**
 * Inlineable fast tanh based on a Pade 7/8 rational approximation,
 * written such that the compiler can potentially vectorize it.
 * Accurate inside [-4.97, 4.97]; saturates to +/-1 outside.
 */
#if !defined(LIBXSMM_API_INLINE) /* fallback keeps this block self-contained */
# define LIBXSMM_API_INLINE static
#endif
LIBXSMM_API_INLINE float libxsmm_stanh_pade78(float i_x)
{
  /* Numerator coefficients: x * (c3*x^6 + c2*x^4 + c1*x^2 + c0). */
  const float l_c0 = 2027025.0f;
  const float l_c1 = 270270.0f;
  const float l_c2 = 6930.0f;
  const float l_c3 = 36.0f;
  /* Denominator coefficients: x^8 + c3_d*x^6 + c2_d*x^4 + c1_d*x^2 + c0. */
  const float l_c1_d = 945945.0f;
  const float l_c2_d = 51975.0f;
  const float l_c3_d = 630.0f;
  /* Beyond +/-4.97 the rational form degrades; saturate to tanh's asymptotes. */
  const float l_hi_bound = 4.97f;
  const float l_lo_bound = -4.97f;
  const float l_ones = 1.0f;
  const float l_neg_ones = -1.0f;
  const float x2 = i_x * i_x;
  /* Horner evaluation of the (odd) numerator polynomial. */
  const float t1_nom = (l_c3 * x2) + l_c2;
  const float t2_nom = (t1_nom * x2) + l_c1;
  const float t3_nom = (t2_nom * x2) + l_c0;
  const float nom = t3_nom * i_x;
  /* Horner evaluation of the (even) denominator polynomial. */
  const float t1_denom = x2 + l_c3_d;
  const float t2_denom = (t1_denom * x2) + l_c2_d;
  const float t3_denom = (t2_denom * x2) + l_c1_d;
  const float denom = (t3_denom * x2) + l_c0;
  float result = nom / denom;
  /* BUGFIX: saturate based on the input i_x, not on result. The rational form
   * decays towards zero for large |x| (e.g., ~0.34 at x=100), so comparing
   * result against +/-4.97 could never trigger, and out-of-range inputs
   * returned values far from tanh(x). */
  result = (i_x > l_hi_bound) ? l_ones : result;
  result = (i_x < l_lo_bound) ? l_neg_ones : result;
  return result;
}
#endif
/*LIBXSMM_MATH_H*/
third_party/libxsmm/include/libxsmm_memory.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MEMORY_H
#define LIBXSMM_MEMORY_H
#include "libxsmm_macros.h"
#if defined(__clang_analyzer__)
/* Plain memset keeps the static analyzer from reasoning about the hand-rolled loop below. */
# define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) memset((void*)(PTRDST), VALUE, SIZE)
#else
/* Set SIZE Bytes (SIZE <= 127, asserted) at PTRDST to VALUE using a small, unrollable loop.
 * The size is funneled through a union so the loop bound is a signed char (size1). */
# define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) { \
  char *const libxsmm_memset127_dst_ = (char*)(PTRDST); \
  union { size_t size; signed char size1; } libxsmm_memset127_; \
  signed char libxsmm_memset127_i_; LIBXSMM_ASSERT((SIZE) <= 127); \
  libxsmm_memset127_.size = (SIZE); \
  LIBXSMM_PRAGMA_UNROLL \
  for (libxsmm_memset127_i_ = 0; libxsmm_memset127_i_ < libxsmm_memset127_.size1; \
    ++libxsmm_memset127_i_) \
  { \
    libxsmm_memset127_dst_[libxsmm_memset127_i_] = (char)(VALUE); \
  } \
}
#endif
/* Zero-fill the object pointed to by PTRDST (sizeof of the pointee must be <= 127). */
#define LIBXSMM_MEMZERO127(PTRDST) LIBXSMM_MEMSET127(PTRDST, '\0', sizeof(*(PTRDST)))
/* Copy SIZE Bytes (SIZE <= 127, asserted) from PTRSRC to PTRDST; NTS is a hook macro
 * applied to the destination pointer (e.g., for non-temporal store annotations). */
#define LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, NTS) { \
  const unsigned char *const libxsmm_memcpy127_loop_src_ = (const unsigned char*)(PTRSRC); \
  unsigned char *const libxsmm_memcpy127_loop_dst_ = (unsigned char*)(PTRDST); \
  signed char libxsmm_memcpy127_loop_i_; LIBXSMM_ASSERT((SIZE) <= 127); \
  NTS(libxsmm_memcpy127_loop_dst_) LIBXSMM_PRAGMA_UNROLL \
  for (libxsmm_memcpy127_loop_i_ = 0; libxsmm_memcpy127_loop_i_ < (signed char)(SIZE); \
    ++libxsmm_memcpy127_loop_i_) \
  { \
    libxsmm_memcpy127_loop_dst_[libxsmm_memcpy127_loop_i_] = \
      libxsmm_memcpy127_loop_src_[libxsmm_memcpy127_loop_i_]; \
  } \
}
/* No-op NTS hook used by the default LIBXSMM_MEMCPY127. */
#define LIBXSMM_MEMCPY127_NTS(...)
/* Copy SIZE Bytes (SIZE <= 127) without any store annotation. */
#define LIBXSMM_MEMCPY127(PTRDST, PTRSRC, SIZE) \
  LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, LIBXSMM_MEMCPY127_NTS)
/* Assign the object at PTRSRC to PTRDST; asserts the destination is large enough. */
#define LIBXSMM_ASSIGN127(PTRDST, PTRSRC) LIBXSMM_ASSERT(sizeof(*(PTRSRC)) <= sizeof(*(PTRDST))); \
  LIBXSMM_MEMCPY127(PTRDST, PTRSRC, sizeof(*(PTRSRC)))
/**
 * Calculates if there is a difference between two (short) buffers.
 * Returns zero if there is no difference; otherwise non-zero.
 */
LIBXSMM_API unsigned char libxsmm_diff(const void* a, const void* b, unsigned char size);

/**
 * Calculates if there is a difference between "a" and "n x b".
 * Returns the index of the first match (or "n" in case of no match).
 */
LIBXSMM_API unsigned int libxsmm_diff_n(const void* a, const void* bn,
  unsigned char size, unsigned char stride, unsigned int hint, unsigned int n);

/** Similar to memcmp (C standard library), but the result is conceptually only a boolean. */
LIBXSMM_API int libxsmm_memcmp(const void* a, const void* b, size_t size);

/** Calculate a hash value for the given buffer and seed; accepts NULL-buffer. */
LIBXSMM_API unsigned int libxsmm_hash(const void* data, unsigned int size, unsigned int seed);

/** Calculate a 64-bit hash for the given character string; accepts NULL-string. */
LIBXSMM_API unsigned long long libxsmm_hash_string(const char* string);

/** Return the pointer to the 1st match of "b" in "a", or NULL (no match). */
LIBXSMM_API const char* libxsmm_stristr(const char* a, const char* b);

/**
 * Check if pointer is SIMD-aligned and optionally consider the next access (increment in Bytes).
 * Optionally calculates the alignment of the given pointer in Bytes.
 */
LIBXSMM_API int libxsmm_aligned(const void* ptr, const size_t* inc, int* alignment);
#endif
/*LIBXSMM_MEMORY_H*/
third_party/libxsmm/include/libxsmm_mhd.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_MHD_H
#define LIBXSMM_MHD_H
#include "libxsmm_typedefs.h"
/** Denotes the element/pixel type of an image/channel (MET_* are the MHD type names). */
typedef enum libxsmm_mhd_elemtype {
  LIBXSMM_MHD_ELEMTYPE_F64  = LIBXSMM_DATATYPE_F64,  /* MET_DOUBLE */
  LIBXSMM_MHD_ELEMTYPE_F32  = LIBXSMM_DATATYPE_F32,  /* MET_FLOAT */
  LIBXSMM_MHD_ELEMTYPE_BF16 = LIBXSMM_DATATYPE_BF16, /* MET_BFLOAT */
  LIBXSMM_MHD_ELEMTYPE_I64  = LIBXSMM_DATATYPE_I64,  /* MET_LONG */
  LIBXSMM_MHD_ELEMTYPE_I32  = LIBXSMM_DATATYPE_I32,  /* MET_INT */
  LIBXSMM_MHD_ELEMTYPE_I16  = LIBXSMM_DATATYPE_I16,  /* MET_SHORT */
  LIBXSMM_MHD_ELEMTYPE_I8   = LIBXSMM_DATATYPE_I8,   /* MET_CHAR */
  /* unsigned types have no libxsmm_datatype counterpart; enumerated past UNSUPPORTED */
  LIBXSMM_MHD_ELEMTYPE_U64  = LIBXSMM_DATATYPE_UNSUPPORTED, /* MET_ULONG */
  LIBXSMM_MHD_ELEMTYPE_U32,  /* MET_UINT */
  LIBXSMM_MHD_ELEMTYPE_U16,  /* MET_USHORT */
  LIBXSMM_MHD_ELEMTYPE_U8,   /* MET_UCHAR */
  LIBXSMM_MHD_ELEMTYPE_UNKNOWN
} libxsmm_mhd_elemtype;
/**
 * Function type used for custom data-handler or element conversion.
 * The value-range (src_min, src_max) may be used to scale values
 * in case of a type-conversion.
 */
LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE int (*libxsmm_mhd_element_handler)(
  void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type,
  const void* src, const void* src_min, const void* src_max);

/**
 * Predefined function to perform element data conversion.
 * Scales source-values in case of non-NULL src_min and src_max,
 * or otherwise clamps to the destination-type.
 */
LIBXSMM_API int libxsmm_mhd_element_conversion(
  void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type,
  const void* src, const void* src_min, const void* src_max);

/**
 * Predefined function to check a buffer against file content.
 * In case of different types, libxsmm_mhd_element_conversion
 * is performed to compare values using the source-type.
 */
LIBXSMM_API int libxsmm_mhd_element_comparison(
  void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type,
  const void* src, const void* src_min, const void* src_max);

/** Returns the name and size of the element type; result may be NULL/0 in case of an unknown type. */
LIBXSMM_API const char* libxsmm_mhd_typename(libxsmm_mhd_elemtype type,
  size_t* typesize, const char** ctypename);

/** Returns the type of the element for a given type-name. */
LIBXSMM_API libxsmm_mhd_elemtype libxsmm_mhd_typeinfo(const char elemname[]);

/**
 * Parse the header of an MHD-file. The header can be part of the data file (local),
 * or separately stored (header: MHD, data MHA or RAW).
 */
LIBXSMM_API int libxsmm_mhd_read_header(
  /* Filename referring to the header-file (may also contain the data). */
  const char header_filename[],
  /* Maximum length of path/file name. */
  size_t filename_max_length,
  /* Filename containing the data (may be the same as the header-file). */
  char filename[],
  /* Yields the maximum/possible number of dimensions on input,
   * and the actual number of dimensions on output. */
  size_t* ndims,
  /* Image extents ("ndims" number of entries). */
  size_t size[],
  /* Number of interleaved image channels. */
  size_t* ncomponents,
  /* Type of the image elements (pixel type). */
  libxsmm_mhd_elemtype* type,
  /* Size of the header in bytes; may be used to skip the header,
   * when reading content; can be a NULL-argument (optional). */
  size_t* header_size,
  /* Size (in Bytes) of an user-defined extended data record;
   * can be a NULL-argument (optional). */
  size_t* extension_size);

/**
 * Loads the data file, and optionally allows data conversion.
 * Conversion is performed such that values are clamped to fit
 * into the destination.
 */
LIBXSMM_API int libxsmm_mhd_read(
  /* Filename referring to the data. */
  const char filename[],
  /* Offset within pitched buffer (NULL: no offset). */
  const size_t offset[],
  /* Image dimensions (extents). */
  const size_t size[],
  /* Leading buffer dimensions (NULL: same as size). */
  const size_t pitch[],
  /* Dimensionality (number of entries in size). */
  size_t ndims,
  /* Number of interleaved image channels. */
  size_t ncomponents,
  /* Used to skip the header, and to only read the data. */
  size_t header_size,
  /* Data element type as stored (pixel type). */
  libxsmm_mhd_elemtype type_stored,
  /* Storage type (data conversion, optional). */
  const libxsmm_mhd_elemtype* type_data,
  /* Buffer where the data is read into. */
  void* data,
  /**
   * Optional callback executed per entry when reading the data.
   * May assign the value to the left-most argument, but also
   * allows to only compare with present data. Can be used to
   * avoid allocating an actual destination.
   */
  libxsmm_mhd_element_handler handle_element,
  /* Post-content data (extension, optional). */
  char extension[],
  /* Size of the extension; can be zero. */
  size_t extension_size);

/**
 * Save a file using an extended data format, which is compatible with the Meta Image Format (MHD).
 * The file is suitable for visual inspection using, e.g., ITK-SNAP or ParaView.
 */
LIBXSMM_API int libxsmm_mhd_write(const char filename[],
  /* Offset within pitched buffer (NULL: no offset). */
  const size_t offset[],
  /* Image dimensions (extents). */
  const size_t size[],
  /* Leading buffer dimensions (NULL: same as size). */
  const size_t pitch[],
  /* Dimensionality, i.e., number of entries in data_size/size. */
  size_t ndims,
  /* Number of pixel components. */
  size_t ncomponents,
  /* Type (input). */
  libxsmm_mhd_elemtype type_data,
  /* Type (data conversion, optional). */
  const libxsmm_mhd_elemtype* type,
  /* Raw data to be saved. */
  const void* data,
  /* Size of the header; can be a NULL-argument (optional). */
  size_t* header_size,
  /* Extension header data; can be NULL. */
  const char extension_header[],
  /* Extension data stream; can be NULL. */
  const void* extension,
  /* Extension data size; can be NULL. */
  size_t extension_size);
#endif
/*LIBXSMM_MHD_H*/
third_party/libxsmm/include/libxsmm_rng.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Alexander Heinecke, Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_RNG_H
#define LIBXSMM_RNG_H
#include "libxsmm_typedefs.h"
/**
 * Create a new external state for thread-safe execution managed
 * by the user. We do not provide a function for drawing the random numbers;
 * the user is supposed to call the LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS
 * or LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32 intrinsic.
 */
LIBXSMM_API unsigned int* libxsmm_rng_create_extstate(unsigned int/*uint32_t*/ seed);

/** Free a previously created external RNG state (libxsmm_rng_create_extstate). */
LIBXSMM_API void libxsmm_rng_destroy_extstate(unsigned int* stateptr);

/** Set the seed of libxsmm_rng_* (similar to srand). */
LIBXSMM_API void libxsmm_rng_set_seed(unsigned int/*uint32_t*/ seed);

/**
 * This SP-RNG is using xoshiro128+ 1.0, work done by
 * David Blackman and Sebastiano Vigna (vigna@acm.org).
 * It is their best and fastest 32-bit generator for
 * 32-bit floating-point numbers. They suggest to use
 * its upper bits for floating-point generation, what
 * we do here and generate numbers in [0,1(.
 */
LIBXSMM_API void libxsmm_rng_f32_seq(float* rngs, libxsmm_blasint count);

/**
 * Returns a (pseudo-)random value based on rand/rand48 in the interval [0, n).
 * This function compensates for an n, which is not a factor of RAND_MAX.
 * Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator.
 */
LIBXSMM_API unsigned int libxsmm_rng_u32(unsigned int n);

/** Sequence of random data based on libxsmm_rng_u32. */
LIBXSMM_API void libxsmm_rng_seq(void* data, libxsmm_blasint nbytes);

/**
 * Similar to libxsmm_rng_u32, but returns a DP-value in the interval [0, 1).
 * Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator.
 */
LIBXSMM_API double libxsmm_rng_f64(void);
#endif
/* LIBXSMM_RNG_H */
third_party/libxsmm/include/libxsmm_source.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_SOURCE_H
#define LIBXSMM_SOURCE_H
#if defined(LIBXSMM_MACROS_H)
# error Please do not include any LIBXSMM header other than libxsmm_source.h!
#endif
#if defined(LIBXSMM_BUILD)
# error LIBXSMM_BUILD cannot be defined for the header-only LIBXSMM!
#endif
/**
* This header is intentionally called "libxsmm_source.h" since the followings block
* includes *internal* files, and thereby exposes LIBXSMM's implementation.
* The so-called "header-only" usage model gives up the clearly defined binary interface
* (including support for hot-fixes after deployment), and requires to rebuild client
* code for every (internal) change of LIBXSMM. Please make sure to only rely on the
* public interface as the internal implementation may change without notice.
*/
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
#include "../src/generator_aarch64_instructions.c"
#include "../src/generator_common.c"
#include "../src/generator_common_aarch64.c"
#include "../src/generator_common_x86.c"
#include "../src/generator_gemm.c"
#include "../src/generator_gemm_aarch64.c"
#include "../src/generator_gemm_amx.c"
#include "../src/generator_gemm_amx_emu.c"
#include "../src/generator_gemm_amx_microkernel.c"
#include "../src/generator_gemm_amx_microkernel_emu.c"
#include "../src/generator_gemm_avx2_microkernel.c"
#include "../src/generator_gemm_avx512_microkernel.c"
#include "../src/generator_gemm_avx_microkernel.c"
#include "../src/generator_gemm_common.c"
#include "../src/generator_gemm_common_aarch64.c"
#include "../src/generator_gemm_noarch.c"
#include "../src/generator_gemm_sse_avx_avx2_avx512.c"
#include "../src/generator_gemm_sse_microkernel.c"
#include "../src/generator_mateltwise.c"
#include "../src/generator_mateltwise_misc_avx_avx512.c"
#include "../src/generator_mateltwise_reduce_avx_avx512.c"
#include "../src/generator_mateltwise_sse_avx_avx512.c"
#include "../src/generator_mateltwise_transform_avx.c"
#include "../src/generator_mateltwise_transform_avx512.c"
#include "../src/generator_mateltwise_transform_common.c"
#include "../src/generator_mateltwise_transform_common_x86.c"
#include "../src/generator_mateltwise_transform_sse.c"
#include "../src/generator_mateltwise_unary_binary_avx_avx512.c"
#include "../src/generator_matequation.c"
#include "../src/generator_matequation_avx_avx512.c"
#include "../src/generator_matequation_regblocks_avx_avx512.c"
#include "../src/generator_matequation_scratch_avx_avx512.c"
#include "../src/generator_packed_gemm_ac_rm.c"
#include "../src/generator_packed_gemm_ac_rm_aarch64.c"
#include "../src/generator_packed_gemm_ac_rm_avx_avx2_avx512.c"
#include "../src/generator_packed_gemm_bc_rm.c"
#include "../src/generator_packed_gemm_bc_rm_aarch64.c"
#include "../src/generator_packed_gemm_bc_rm_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm.c"
#include "../src/generator_packed_spgemm_csc_bsparse.c"
#include "../src/generator_packed_spgemm_csc_bsparse_aarch64.c"
#include "../src/generator_packed_spgemm_csc_bsparse_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm_csc_csparse.c"
#include "../src/generator_packed_spgemm_csc_csparse_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm_csr_asparse.c"
#include "../src/generator_packed_spgemm_csr_asparse_aarch64.c"
#include "../src/generator_packed_spgemm_csr_asparse_avx_avx2_avx512.c"
#include "../src/generator_packed_spgemm_csr_bsparse.c"
#include "../src/generator_packed_spgemm_csr_bsparse_aarch64.c"
#include "../src/generator_packed_spgemm_csr_bsparse_avx_avx2_avx512.c"
#include "../src/generator_spgemm.c"
#include "../src/generator_spgemm_csc_asparse.c"
#include "../src/generator_spgemm_csc_bsparse.c"
#include "../src/generator_spgemm_csc_reader.c"
#include "../src/generator_spgemm_csr_asparse.c"
#include "../src/generator_spgemm_csr_asparse_reg.c"
#include "../src/generator_spgemm_csr_reader.c"
#include "../src/generator_x86_instructions.c"
#include "../src/libxsmm_cpuid_arm.c"
#include "../src/libxsmm_cpuid_x86.c"
#include "../src/libxsmm_dnn.c"
#include "../src/libxsmm_dnn_convolution.c"
#include "../src/libxsmm_dnn_convolution_backward.c"
#include "../src/libxsmm_dnn_convolution_forward.c"
#include "../src/libxsmm_dnn_convolution_weight_update.c"
#include "../src/libxsmm_dnn_elementwise.c"
#include "../src/libxsmm_dnn_fullyconnected.c"
#include "../src/libxsmm_dnn_fullyconnected_backward_weight_update.c"
#include "../src/libxsmm_dnn_fullyconnected_forward.c"
#include "../src/libxsmm_dnn_fusedbatchnorm.c"
#include "../src/libxsmm_dnn_fusedbatchnorm_backward.c"
#include "../src/libxsmm_dnn_fusedbatchnorm_forward.c"
#include "../src/libxsmm_dnn_fusedgroupnorm.c"
#include "../src/libxsmm_dnn_fusedgroupnorm_backward.c"
#include "../src/libxsmm_dnn_fusedgroupnorm_forward.c"
#include "../src/libxsmm_dnn_optimizer.c"
#include "../src/libxsmm_dnn_optimizer_sgd.c"
#include "../src/libxsmm_dnn_pooling.c"
#include "../src/libxsmm_dnn_pooling_backward.c"
#include "../src/libxsmm_dnn_pooling_forward.c"
#include "../src/libxsmm_dnn_rnncell.c"
#include "../src/libxsmm_dnn_rnncell_backward_weight_update.c"
#include "../src/libxsmm_dnn_rnncell_forward.c"
#include "../src/libxsmm_dnn_softmaxloss.c"
#include "../src/libxsmm_dnn_softmaxloss_backward.c"
#include "../src/libxsmm_dnn_softmaxloss_forward.c"
#include "../src/libxsmm_dnn_tensor.c"
#include "../src/libxsmm_ext.c"
#include "../src/libxsmm_ext_gemm.c"
#include "../src/libxsmm_ext_xcopy.c"
#include "../src/libxsmm_fsspmdm.c"
#include "../src/libxsmm_gemm.c"
#include "../src/libxsmm_generator.c"
#include "../src/libxsmm_hash.c"
#include "../src/libxsmm_main.c"
#include "../src/libxsmm_malloc.c"
#include "../src/libxsmm_math.c"
#include "../src/libxsmm_matrixeqn.c"
#include "../src/libxsmm_memory.c"
#include "../src/libxsmm_mhd.c"
#include "../src/libxsmm_perf.c"
#include "../src/libxsmm_python.c"
#include "../src/libxsmm_rng.c"
#include "../src/libxsmm_spmdm.c"
#include "../src/libxsmm_sync.c"
#include "../src/libxsmm_timer.c"
#include "../src/libxsmm_trace.c"
#include "../src/libxsmm_xcopy.c"
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif
#endif
/*LIBXSMM_SOURCE_H*/
third_party/libxsmm/include/libxsmm_spmdm.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Nadathur Satish (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_SPMDM_H
#define LIBXSMM_SPMDM_H
#include "libxsmm_typedefs.h"
/** Element type supported by the SPMDM (sparse * dense) routines. */
typedef enum libxsmm_spmdm_datatype {
  LIBXSMM_SPMDM_DATATYPE_F32,
  LIBXSMM_SPMDM_DATATYPE_BFLOAT16
} libxsmm_spmdm_datatype;

LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spmdm_handle {
  /* The following are the matrix multiply dimensions: A (sparse): m X k, B (dense): k X n, Output C (dense): m X n */
  int m;
  int n;
  int k;
  /* The block sizes for A, B and C. */
  /* Here we fix A to be divided into 128 X 128 blocks, B/C to be 128 X 48 for HSW/BDW and 128 X 96 for SKX */
  int bm;
  int bn;
  int bk;
  /* The number of blocks for the m, n and k dimensions */
  int mb;
  int nb;
  int kb;
  /* Element type of the operands (F32 or BFLOAT16). */
  libxsmm_spmdm_datatype datatype;
  /* Internal scratch buffers (A, and combined B/C). */
  char* base_ptr_scratch_A;
  char* base_ptr_scratch_B_scratch_C;
  int memory_for_scratch_per_thread;
} libxsmm_spmdm_handle;

/**
 * This stores a single sparse splice (or block) of sparse matrix A using a CSR representation
 * (rowidx, colidx, and values). Each splice corresponds to a bm X bk region of A, and stores local indexes.
 */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_CSR_sparseslice {
  /* Since bm and bk are assumed to be <=256, a 16-bit integer is enough to store the local rowidx, colidx */
  uint16_t* rowidx;
  uint16_t* colidx;
  float* values;
} libxsmm_CSR_sparseslice;
/** Initialize a SPMDM handle for an M x K (sparse A) times K x N (dense B) product; allocates scratch/CSR storage. */
LIBXSMM_API void libxsmm_spmdm_init(int M, int N, int K, int max_threads,
  libxsmm_spmdm_handle* handle, libxsmm_CSR_sparseslice** libxsmm_output_csr);

/** Release resources associated with the given SPMDM handle. */
LIBXSMM_API void libxsmm_spmdm_destroy(libxsmm_spmdm_handle* handle);

/** Number of work-blocks for the sparse-slice creation phase (block_id range). */
LIBXSMM_API int libxsmm_spmdm_get_num_createSparseSlice_blocks(const libxsmm_spmdm_handle* handle);

/** Number of work-blocks for the compute phase (block_id range). */
LIBXSMM_API int libxsmm_spmdm_get_num_compute_blocks(const libxsmm_spmdm_handle* handle);

/** This converts a dense representation of the sparse matrix to 2D array of sparse slices. */
LIBXSMM_API void libxsmm_spmdm_createSparseSlice_fp32_thread(
  const libxsmm_spmdm_handle* handle, char transa, const float* a,
  libxsmm_CSR_sparseslice* libxsmm_output_csr_a,
  int block_id, int tid, int nthreads);

/** Same as libxsmm_spmdm_createSparseSlice_fp32_thread, but for BFLOAT16 input. */
LIBXSMM_API void libxsmm_spmdm_createSparseSlice_bfloat16_thread(
  const libxsmm_spmdm_handle* handle, char transa, const libxsmm_bfloat16* a,
  libxsmm_CSR_sparseslice* libxsmm_output_csr_a,
  int block_id, int tid, int nthreads);

/** NOTE: This code currently ignores alpha input to the matrix multiply */
LIBXSMM_API void libxsmm_spmdm_compute_fp32_thread(
  const libxsmm_spmdm_handle* handle, char transa, char transb,
  const float* alpha, libxsmm_CSR_sparseslice* a_sparse, const float* b,
  char transc, const float* beta, float* c,
  int block_id, int tid, int nthreads);

/** NOTE: This code currently ignores alpha input to the matrix multiply */
LIBXSMM_API void libxsmm_spmdm_compute_bfloat16_thread(
  const libxsmm_spmdm_handle* handle, char transa, char transb,
  const libxsmm_bfloat16* alpha, libxsmm_CSR_sparseslice* a_sparse, const libxsmm_bfloat16* b,
  char transc, const libxsmm_bfloat16* beta, float* c,
  int block_id, int tid, int nthreads);
#endif
/*LIBXSMM_SPMDM_H*/
third_party/libxsmm/include/libxsmm_sync.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_SYNC_H
#define LIBXSMM_SYNC_H
#include "libxsmm_intrinsics_x86.h"
/* Select LIBXSMM_TLS: the storage-class keyword used to declare thread-local
 * data. Expands to nothing when threading is disabled (LIBXSMM_NO_TLS set). */
#if !defined(LIBXSMM_TLS)
# if (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_NO_TLS)
#   if defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin: TLS not usable */
#     define LIBXSMM_NO_TLS
#     define LIBXSMM_TLS
#   else
#     if (defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__)) || (defined(__PGI) && !defined(__cplusplus))
#       define LIBXSMM_TLS LIBXSMM_ATTRIBUTE(thread) /* presumably __declspec(thread)-style — see LIBXSMM_ATTRIBUTE */
#     elif defined(__GNUC__) || defined(__clang__) || defined(_CRAYC)
#       define LIBXSMM_TLS __thread
#     elif defined(__cplusplus)
#       define LIBXSMM_TLS thread_local
#     else
#       error Missing TLS support!
#     endif
#   endif
# else /* synchronization disabled: no TLS keyword needed */
#   if !defined(LIBXSMM_NO_TLS)
#     define LIBXSMM_NO_TLS
#   endif
#   define LIBXSMM_TLS
# endif
#endif
/* LIBXSMM_GCC_BASELINE: compiler provides the modern __atomic builtins
 * (GCC 4.7+, or Clang on Windows), unless legacy mode is forced via
 * LIBXSMM_SYNC_LEGACY. */
#if !defined(LIBXSMM_GCC_BASELINE) && !defined(LIBXSMM_SYNC_LEGACY) && ((defined(_WIN32) && defined(__clang__)) || \
  (defined(__GNUC__) && LIBXSMM_VERSION2(4, 7) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)))
# define LIBXSMM_GCC_BASELINE
#endif
/* LIBXSMM_SYNC_PAUSE: CPU relaxation hint used inside spin-wait loops
 * (x86 PAUSE or equivalent); expands to nothing when unavailable. */
#if defined(__MIC__)
# define LIBXSMM_SYNC_PAUSE _mm_delay_32(8/*delay*/)
#elif !defined(LIBXSMM_INTRINSICS_NONE)
# if defined(LIBXSMM_GCC_BASELINE) && !defined(__INTEL_COMPILER)
#   define LIBXSMM_SYNC_PAUSE __builtin_ia32_pause()
# else
#   define LIBXSMM_SYNC_PAUSE _mm_pause()
# endif
#elif (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH) && defined(__GNUC__)
# define LIBXSMM_SYNC_PAUSE __asm__ __volatile__("pause" ::: "memory")
#else
# define LIBXSMM_SYNC_PAUSE
#endif
/* permit thread-unsafe */
#if !defined(LIBXSMM_SYNC_NONE) && ( \
(defined(__PGI) && (!defined(LIBXSMM_LIBATOMIC) || !defined(__STATIC))) || \
(defined(_CRAYC) && !defined(__GNUC__)))
# define LIBXSMM_SYNC_NONE
#endif
#if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) && 0
# define LIBXSMM_ATOMIC_TRYLOCK_CMPSWP
#endif
#if !defined(LIBXSMM_ATOMIC_ZERO_STORE) && defined(_CRAYC)
# define LIBXSMM_ATOMIC_ZERO_STORE
#endif
#if !defined(LIBXSMM_ATOMIC_LOCKTYPE)
# if defined(_WIN32) || 1
/*alignment*/
# define LIBXSMM_ATOMIC_LOCKTYPE int
# else
# define LIBXSMM_ATOMIC_LOCKTYPE char
# endif
#endif
typedef
enum
libxsmm_atomic_kind
{
#if defined(__ATOMIC_SEQ_CST)
LIBXSMM_ATOMIC_SEQ_CST
=
__ATOMIC_SEQ_CST
,
#else
LIBXSMM_ATOMIC_SEQ_CST
=
0
,
#endif
#if defined(__ATOMIC_RELAXED)
LIBXSMM_ATOMIC_RELAXED
=
__ATOMIC_RELAXED
#else
LIBXSMM_ATOMIC_RELAXED
=
LIBXSMM_ATOMIC_SEQ_CST
#endif
}
libxsmm_atomic_kind
;
/* Thread-unsafe counterparts of the LIBXSMM_ATOMIC_* primitives. They share
 * the same signature (including the memory-order KIND argument, which is
 * ignored) so the atomic layer can alias them when synchronization is off
 * (0 == LIBXSMM_SYNC, or LIBXSMM_SYNC_NONE).
 * Fix: macro arguments (DST_PTR, VALUE) are now fully parenthesized in every
 * expansion, so pointer expressions such as (array + i) expand correctly
 * (previously FETCH_OR used the unparenthesized *DST_PTR). */
#define LIBXSMM_NONATOMIC_LOCKTYPE LIBXSMM_ATOMIC_LOCKTYPE
#define LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) (*(SRC_PTR))
#define LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) { LIBXSMM_UNUSED(KIND); *(DST_PTR) = (VALUE); }
#define LIBXSMM_NONATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND)
/* Returns the original *DST_PTR. NOTE: VALUE is clobbered (side effect) and
 * afterwards holds the original *DST_PTR (XOR-swap, then OR-in). */
#define LIBXSMM_NONATOMIC_FETCH_OR(DST_PTR, VALUE/*side-effect*/, KIND) ( /* 1st step: swap(dst, val) */ \
  (*(DST_PTR) = *(DST_PTR) ^ (VALUE)), ((VALUE) = (VALUE) ^ *(DST_PTR)), (*(DST_PTR) = *(DST_PTR) ^ (VALUE)), \
  (*(DST_PTR) |= (VALUE)), (VALUE) /* 2nd step: or, and 3rd/last step: original dst-value */)
#define LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) += (VALUE))
#define LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) -= (VALUE))
#define LIBXSMM_NONATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) - (VALUE)))
#define LIBXSMM_NONATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) + (VALUE)))
/* NOTE(review): when NEWVAL == OLDVAL but *DST_PTR != OLDVAL this still
 * yields true without swapping; the trylock use-case (0 -> 1) never hits
 * that combination, so the quirk is preserved for compatibility. */
#define LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) ((NEWVAL) == (*(DST_PTR) == (OLDVAL) ? (*(DST_PTR) = (NEWVAL)) : (OLDVAL)))
#define LIBXSMM_NONATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, 0, 1, KIND)
#define LIBXSMM_NONATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) { LIBXSMM_UNUSED(NPAUSE); \
  LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 1, KIND); \
  LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); }
#define LIBXSMM_NONATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_UNUSED(DST_PTR); LIBXSMM_UNUSED(KIND); \
  LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND); \
  LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); }
#define LIBXSMM_NONATOMIC_SYNC(KIND) LIBXSMM_UNUSED(KIND)
/* Synchronization disabled: alias every LIBXSMM_ATOMIC_* primitive to its
 * thread-unsafe NONATOMIC counterpart, and never pause in spin loops. */
#if (0 == LIBXSMM_SYNC) || defined(LIBXSMM_SYNC_NONE)
# define LIBXSMM_ATOMIC(FN, BITS) FN
# define LIBXSMM_ATOMIC_LOAD LIBXSMM_NONATOMIC_LOAD
# define LIBXSMM_ATOMIC_STORE LIBXSMM_NONATOMIC_STORE
# define LIBXSMM_ATOMIC_STORE_ZERO LIBXSMM_NONATOMIC_STORE_ZERO
# define LIBXSMM_ATOMIC_FETCH_OR LIBXSMM_NONATOMIC_FETCH_OR
# define LIBXSMM_ATOMIC_ADD_FETCH LIBXSMM_NONATOMIC_ADD_FETCH
# define LIBXSMM_ATOMIC_SUB_FETCH LIBXSMM_NONATOMIC_SUB_FETCH
# define LIBXSMM_ATOMIC_FETCH_ADD LIBXSMM_NONATOMIC_FETCH_ADD
# define LIBXSMM_ATOMIC_FETCH_SUB LIBXSMM_NONATOMIC_FETCH_SUB
# define LIBXSMM_ATOMIC_CMPSWP LIBXSMM_NONATOMIC_CMPSWP
# define LIBXSMM_ATOMIC_TRYLOCK LIBXSMM_NONATOMIC_TRYLOCK
# define LIBXSMM_ATOMIC_ACQUIRE LIBXSMM_NONATOMIC_ACQUIRE
# define LIBXSMM_ATOMIC_RELEASE LIBXSMM_NONATOMIC_RELEASE
# define LIBXSMM_ATOMIC_SYNC LIBXSMM_NONATOMIC_SYNC
# if !defined(LIBXSMM_SYNC_NPAUSE)
#   define LIBXSMM_SYNC_NPAUSE 0
# endif
#endif
/* GCC-compatible atomics: either GNU libatomic's width-suffixed
 * __atomic_*_N entry points, modern __atomic builtins, or legacy __sync. */
#elif (defined(LIBXSMM_GCC_BASELINE) || defined(LIBXSMM_LIBATOMIC)/* GNU's libatomic required */ || \
  (defined(__GNUC__) && LIBXSMM_VERSION2(4, 1) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)))
# if defined(LIBXSMM_LIBATOMIC)
/* LIBXSMM_ATOMIC(FN, BITS) selects the width-specific variant of FN. */
#   define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, BITS)(FN)
#   define LIBXSMM_ATOMIC8(FN) LIBXSMM_CONCATENATE(FN, 8)
#   define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16)
#   define LIBXSMM_ATOMIC32(FN) FN /*default*/
#   define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64)
#   if defined(__PGI) /* PGI: fall back to plain loads/stores */
#     define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
#     define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
#     define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
#     define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND)
#   else /* libatomic load/store; unsuffixed default is the 4-byte variant */
#     define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_4(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) __atomic_load_1(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) __atomic_load_2(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) __atomic_load_8(SRC_PTR, KIND)
#     define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_4(DST_PTR, (unsigned int)(VALUE), KIND)
#     define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) __atomic_store_1(DST_PTR, (unsigned char)(VALUE), KIND)
#     define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) __atomic_store_2(DST_PTR, (unsigned short)(VALUE), KIND)
#     define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) __atomic_store_8(DST_PTR, (unsigned long long)(VALUE), KIND)
#   endif
/* libatomic read-modify-write, compare-and-swap, and lock primitives. */
#   define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or_4(DST_PTR, (unsigned int)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) __atomic_fetch_or_1(DST_PTR, (unsigned char)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_OR16(DST_PTR, VALUE, KIND) __atomic_fetch_or_2(DST_PTR, (unsigned short)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_OR64(DST_PTR, VALUE, KIND) __atomic_fetch_or_8(DST_PTR, (unsigned long long)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch_4(DST_PTR, (int)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_ADD_FETCH8(DST_PTR, VALUE, KIND) __atomic_add_fetch_1(DST_PTR, (signed char)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) __atomic_add_fetch_2(DST_PTR, (short)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) __atomic_add_fetch_8(DST_PTR, (long long)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch_4(DST_PTR, (int)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_SUB_FETCH8(DST_PTR, VALUE, KIND) __atomic_sub_fetch_1(DST_PTR, (signed char)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) __atomic_sub_fetch_2(DST_PTR, (short)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) __atomic_sub_fetch_8(DST_PTR, (long long)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add_4(DST_PTR, (int)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_ADD8(DST_PTR, VALUE, KIND) __atomic_fetch_add_1(DST_PTR, (signed char)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) __atomic_fetch_add_2(DST_PTR, (short)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) __atomic_fetch_add_8(DST_PTR, (long long)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub_4(DST_PTR, (int)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_SUB8(DST_PTR, VALUE, KIND) __atomic_fetch_sub_1(DST_PTR, (signed char)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) __atomic_fetch_sub_2(DST_PTR, (short)(VALUE), KIND)
#   define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) __atomic_fetch_sub_8(DST_PTR, (long long)(VALUE), KIND)
/* Strong CAS: OLDVAL is taken by address (updated on failure);
 * failure ordering is always RELAXED. */
#   define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) \
      __atomic_compare_exchange_4(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
#   define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) \
      __atomic_compare_exchange_1(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
#   define LIBXSMM_ATOMIC_CMPSWP16(DST_PTR, OLDVAL, NEWVAL, KIND) \
      __atomic_compare_exchange_2(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
#   define LIBXSMM_ATOMIC_CMPSWP64(DST_PTR, OLDVAL, NEWVAL, KIND) \
      __atomic_compare_exchange_8(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED)
#   if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
#     define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND))
#   endif
#   if defined(__PGI)
#     define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
        LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND); } /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */
#   else
#     define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
        __atomic_clear(DST_PTR, KIND); }
#   endif
#   define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize()
#   if !defined(LIBXSMM_ATOMIC_ZERO_STORE)
#     define LIBXSMM_ATOMIC_ZERO_STORE
#   endif
/* Modern type-generic __atomic builtins (GCC 4.7+/Clang). */
# elif defined(LIBXSMM_GCC_BASELINE)
#   define LIBXSMM_ATOMIC(FN, BITS) FN
#   define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_n(SRC_PTR, KIND)
#   define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_n(DST_PTR, VALUE, KIND)
#   if !defined(LIBXSMM_ATOMIC_ZERO_STORE)
/* zero via atomic AND; loop is formal (AND with 0 yields 0 immediately) */
#     define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__atomic_and_fetch(DST_PTR, 0, KIND))
#   endif
#   define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or(DST_PTR, VALUE, KIND)
#   define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch(DST_PTR, VALUE, KIND)
#   define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch(DST_PTR, VALUE, KIND)
#   define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add(DST_PTR, VALUE, KIND)
#   define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub(DST_PTR, VALUE, KIND)
/* NOTE: CAS deliberately uses the legacy __sync builtin (KIND unused). */
#   define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL)
#   if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
#     define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND))
#   endif
#   define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
      __atomic_clear(DST_PTR, KIND); }
#   if 0 /* __atomic_thread_fence: incorrect behavior in libxsmm_barrier (even with LIBXSMM_ATOMIC_SEQ_CST) */
#     define LIBXSMM_ATOMIC_SYNC(KIND) __atomic_thread_fence(KIND)
#   else
#     define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize()
#   endif
# else /* GCC legacy atomics */
#   define LIBXSMM_ATOMIC(FN, BITS) FN
/* load: OR with 0 is a full-barrier read under the __sync family */
#   define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __sync_or_and_fetch(SRC_PTR, 0)
#   if (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH)
/* x86: plain store fenced by compiler barriers (TSO makes this sufficient
 * for release-style stores — NOTE(review): confirm for the intended uses) */
#     define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) { \
        __asm__ __volatile__("" ::: "memory"); *(DST_PTR) = (VALUE); \
        __asm__ __volatile__("" ::: "memory"); }
#   else
#     define LIBXSMM_ATOMIC_SYNC_NOFENCE(KIND)
#     define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) *(DST_PTR) = (VALUE)
#   endif
#   if !defined(LIBXSMM_ATOMIC_ZERO_STORE)
#     define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__sync_and_and_fetch(DST_PTR, 0))
#   endif
#   define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __sync_fetch_and_or(DST_PTR, VALUE)
#   define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __sync_add_and_fetch(DST_PTR, VALUE)
#   define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __sync_sub_and_fetch(DST_PTR, VALUE)
#   define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __sync_fetch_and_add(DST_PTR, VALUE)
#   define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __sync_fetch_and_sub(DST_PTR, VALUE)
#   define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL)
#   if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
#     define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == __sync_lock_test_and_set(DST_PTR, 1))
#   endif
#   define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
      __sync_lock_release(DST_PTR); }
#   define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize()
# endif
/* Common tail for all GCC-style branches: width-suffixed zero-stores,
 * default TRYLOCK, spinning ACQUIRE, and the default pause budget. */
# if defined(LIBXSMM_ATOMIC_ZERO_STORE)
#   define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND)
#   define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 8)(DST_PTR, 0, KIND)
#   define LIBXSMM_ATOMIC_STORE_ZERO16(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 16)(DST_PTR, 0, KIND)
#   define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 64)(DST_PTR, 0, KIND)
# endif
# if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
/* lock is taken iff the previous value was 0 (fetch-or with 1) */
#   define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */ \
      (0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND))
# endif
/* Spin until the lock is acquired, backing off via LIBXSMM_SYNC_CYCLE. */
# define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \
    LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4)); \
    while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \
    LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE")
# if !defined(LIBXSMM_SYNC_NPAUSE)
#   define LIBXSMM_SYNC_NPAUSE 4096
# endif
/* Windows (MSVC-style) atomics via the Interlocked* API family. */
#elif defined(_WIN32)
# define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, BITS)(FN)
# define LIBXSMM_ATOMIC8(FN) LIBXSMM_CONCATENATE(FN, 8)
# define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16)
# define LIBXSMM_ATOMIC32(FN) FN /*default*/
# define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64)
/* load: OR with 0 is an atomic full-barrier read */
# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) InterlockedOr((volatile LONG*)(SRC_PTR), 0)
# define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) _InterlockedOr8((volatile char*)(SRC_PTR), 0)
# define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) InterlockedOr64((volatile LONGLONG*)(SRC_PTR), 0)
# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) InterlockedExchange((volatile LONG*)(DST_PTR), (LONG)(VALUE))
/* NOTE(review): (LONGLONG) cast looks like a copy/paste from the 64-bit
 * variant; a (char) cast would match InterlockedExchange8 — verify. */
# define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) InterlockedExchange8((volatile char*)(DST_PTR), (LONGLONG)(VALUE))
# define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) InterlockedExchange64((volatile LONGLONG*)(DST_PTR), (LONGLONG)(VALUE))
# if defined(LIBXSMM_ATOMIC_ZERO_STORE)
#   define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND)
#   define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE8(DST_PTR, 0, KIND)
#   define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE64(DST_PTR, 0, KIND)
# else
#   define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) InterlockedAnd((volatile LONG*)(DST_PTR), 0)
#   define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) InterlockedAnd8((volatile char*)(DST_PTR), 0)
#   define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) InterlockedAnd64((volatile LONGLONG*)(DST_PTR), 0)
# endif
# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) InterlockedOr((volatile LONG*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) _InterlockedOr8((volatile char*)(DST_PTR), VALUE)
/* add/sub-fetch built on fetch-add/sub by re-applying VALUE to the result */
# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) + (VALUE))
# define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) + (VALUE))
# define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) + (VALUE))
# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) ((size_t)LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) - ((size_t)VALUE))
# define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) - (VALUE))
# define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) - (VALUE))
# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) InterlockedExchangeAdd((volatile LONG*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) _InterlockedExchangeAdd16((volatile SHORT*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) InterlockedExchangeAdd64((volatile LONGLONG*)(DST_PTR), VALUE)
# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, -1 * (VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, -1 * (VALUE), KIND)
# define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, -1 * (VALUE), KIND)
/* CAS succeeds iff the returned previous value equals OLDVAL */
# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) (((LONG)(OLDVAL)) == InterlockedCompareExchange((volatile LONG*)(DST_PTR), NEWVAL, OLDVAL))
# define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) ((OLDVAL) == _InterlockedCompareExchange8((volatile char*)(DST_PTR), NEWVAL, OLDVAL))
# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP)
#   define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_CMPSWP, 8)(DST_PTR, 0, 1, KIND)
# else
#   define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND))
# endif
/* Spin until the lock is acquired, backing off via LIBXSMM_SYNC_CYCLE. */
# define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \
    LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4)); \
    while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \
    LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE")
# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { \
    LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \
    LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, 8)(DST_PTR, KIND); }
# define LIBXSMM_ATOMIC_SYNC(KIND) _ReadWriteBarrier()
# if !defined(LIBXSMM_SYNC_NPAUSE)
#   define LIBXSMM_SYNC_NPAUSE 4096
# endif
#else /* consider to permit LIBXSMM_SYNC_NONE */
# error LIBXSMM is missing atomic compiler builtins!
#endif
/* LIBXSMM_SYNC_CYCLE(_ELSE): spin-wait with exponential backoff. Pauses in
 * doubling bursts up to NPAUSE, then yields (and runs ELSE) each round,
 * until the low bit of *DST_PTR matches the low bit of EXP_STATE. */
#if !defined(LIBXSMM_SYNC_CYCLE)
# if (0 < LIBXSMM_SYNC_NPAUSE)
#   define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) do { int libxsmm_sync_cycle_npause_ = 1; \
      do { int libxsmm_sync_cycle_counter_ = 0; \
        for (; libxsmm_sync_cycle_counter_ < libxsmm_sync_cycle_npause_; ++libxsmm_sync_cycle_counter_) LIBXSMM_SYNC_PAUSE; \
        if (libxsmm_sync_cycle_npause_ < (NPAUSE)) { \
          libxsmm_sync_cycle_npause_ *= 2; /* exponential backoff */ \
        } \
        else { /* budget exhausted: cap, yield the CPU, and run ELSE */ \
          libxsmm_sync_cycle_npause_ = (NPAUSE); \
          LIBXSMM_SYNC_YIELD; \
          ELSE \
        } \
      } while(((EXP_STATE) & 1) != (*(DST_PTR) & 1)); \
    } while(0)
# else /* no pause budget: single CPU-pause only */
#   define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) LIBXSMM_SYNC_PAUSE
# endif
# define LIBXSMM_SYNC_CYCLE(DST_PTR, EXP_STATE, NPAUSE) LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, /*else*/;)
#endif
#if (0 != LIBXSMM_SYNC)
# define LIBXSMM_LOCK_DEFAULT LIBXSMM_LOCK_SPINLOCK
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \
(!defined(__linux__) || defined(__USE_XOPEN2K)) && 0
/*disabled*/
# define LIBXSMM_LOCK_SYSTEM_SPINLOCK
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP))
# define LIBXSMM_LOCK_SYSTEM_MUTEX
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \
(!defined(__linux__) || defined(__USE_XOPEN2K) || defined(__USE_UNIX98))
# define LIBXSMM_LOCK_SYSTEM_RWLOCK
# endif
/* Lock type, initialization, destruction, (try-)lock, unlock, etc */
# define LIBXSMM_LOCK_ACQUIRED(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRED_, KIND)
# define LIBXSMM_LOCK_TYPE_ISPOD(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISPOD_, KIND)
# define LIBXSMM_LOCK_TYPE_ISRW(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISRW_, KIND)
# define LIBXSMM_LOCK_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_, KIND)
# define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_INIT_, KIND)(LOCK, ATTR)
# define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_DESTROY_, KIND)(LOCK)
# define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYLOCK_, KIND)(LOCK)
# define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRE_, KIND)(LOCK)
# define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELEASE_, KIND)(LOCK)
# define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYREAD_, KIND)(LOCK)
# define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQREAD_, KIND)(LOCK)
# define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELREAD_, KIND)(LOCK)
/* Attribute type, initialization, destruction */
# define LIBXSMM_LOCK_ATTR_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_TYPE_, KIND)
# define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_INIT_, KIND)(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_DESTROY_, KIND)(ATTR)
/* Cygwin's Pthread implementation appears to be broken; use Win32 */
# if !defined(LIBXSMM_WIN32_THREADS) && (defined(_WIN32) || defined(__CYGWIN__))
# define LIBXSMM_WIN32_THREADS _WIN32_WINNT
# if defined(__CYGWIN__) || defined(__MINGW32__)
/* hack: make SRW-locks available */
# if defined(_WIN32_WINNT)
# undef _WIN32_WINNT
# if !defined(NTDDI_VERSION)
# define NTDDI_VERSION 0x0600
# endif
# define _WIN32_WINNT ((LIBXSMM_WIN32_THREADS) | 0x0600)
# else
# define _WIN32_WINNT 0x0600
# endif
# endif
# endif
# if defined(LIBXSMM_WIN32_THREADS)
# define LIBXSMM_TLS_TYPE DWORD
# define LIBXSMM_TLS_CREATE(KEYPTR) *(KEYPTR) = TlsAlloc()
# define LIBXSMM_TLS_DESTROY(KEY) TlsFree(KEY)
# define LIBXSMM_TLS_SETVALUE(KEY, PTR) TlsSetValue(KEY, PTR)
# define LIBXSMM_TLS_GETVALUE(KEY) TlsGetValue(KEY)
# define LIBXSMM_LOCK_SPINLOCK spin
# if ((LIBXSMM_WIN32_THREADS) & 0x0600)
# define LIBXSMM_LOCK_MUTEX rwlock
# define LIBXSMM_LOCK_RWLOCK rwlock
# else
/* mutex exposes high latency */
# define LIBXSMM_LOCK_MUTEX mutex
# define LIBXSMM_LOCK_RWLOCK mutex
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
# define LIBXSMM_LOCK_ACQUIRED_spin TRUE
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_spin CRITICAL_SECTION
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeCriticalSection(LOCK); }
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) DeleteCriticalSection((LIBXSMM_LOCK_TYPE_spin*)(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) TryEnterCriticalSection(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) EnterCriticalSection(LOCK)
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LeaveCriticalSection(LOCK)
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
# define LIBXSMM_LOCK_ACQUIRED_mutex WAIT_OBJECT_0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex HANDLE
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) (*(LOCK) = CreateMutex(*(ATTR), FALSE, NULL))
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) CloseHandle(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) WaitForSingleObject(*(LOCK), 0)
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) WaitForSingleObject(*(LOCK), INFINITE)
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) ReleaseMutex(*(LOCK))
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex LPSECURITY_ATTRIBUTES
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = NULL)
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
# define LIBXSMM_LOCK_ACQUIRED_rwlock TRUE
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1
# define LIBXSMM_LOCK_TYPE_rwlock SRWLOCK
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeSRWLock(LOCK); }
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) TryAcquireSRWLockExclusive(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) AcquireSRWLockExclusive(LOCK)
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) ReleaseSRWLockExclusive(LOCK)
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) TryAcquireSRWLockShared(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) AcquireSRWLockShared(LOCK)
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) ReleaseSRWLockShared(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_SYNC_YIELD YieldProcessor()
# else
# define LIBXSMM_TLS_TYPE pthread_key_t
# define LIBXSMM_TLS_CREATE(KEYPTR) pthread_key_create(KEYPTR, NULL)
# define LIBXSMM_TLS_DESTROY(KEY) pthread_key_delete(KEY)
# define LIBXSMM_TLS_SETVALUE(KEY, PTR) pthread_setspecific(KEY, PTR)
# define LIBXSMM_TLS_GETVALUE(KEY) pthread_getspecific(KEY)
# if defined(__APPLE__) && defined(__MACH__)
# define LIBXSMM_SYNC_YIELD pthread_yield_np()
# else
# if defined(__USE_GNU) || !defined(__BSD_VISIBLE)
LIBXSMM_EXTERN
int
pthread_yield
(
void
)
LIBXSMM_THROW
;
# else
LIBXSMM_EXTERN
void
pthread_yield
(
void
);
# endif
# define LIBXSMM_SYNC_YIELD pthread_yield()
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && defined(__APPLE__) && defined(__MACH__)
# define LIBXSMM_LOCK_SPINLOCK mutex
# else
# define LIBXSMM_LOCK_SPINLOCK spin
# endif
# define LIBXSMM_LOCK_MUTEX mutex
# define LIBXSMM_LOCK_RWLOCK rwlock
# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
# define LIBXSMM_LOCK_ACQUIRED_spin 0
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin pthread_spinlock_t
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_spin_init(LOCK, *(ATTR)))
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_destroy(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) pthread_spin_trylock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_lock(LOCK))
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_unlock(LOCK))
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = 0)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
/* Mutex kind mapped to POSIX pthread_mutex_t; pthread calls return 0 on success,
 * hence LIBXSMM_LOCK_ACQUIRED_mutex is 0 and calls are wrapped in LIBXSMM_EXPECT. */
# define LIBXSMM_LOCK_ACQUIRED_mutex 0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex pthread_mutex_t
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_mutex_init(LOCK, ATTR))
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_EXPECT_DEBUG(0, pthread_mutex_destroy(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) pthread_mutex_trylock(LOCK)
/*!LIBXSMM_EXPECT: trylock is intentionally not wrapped; its result is returned to the caller */
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_lock(LOCK))
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_unlock(LOCK))
/* no reader/writer distinction for a mutex (ISRW=0): read-side ops alias the exclusive ops */
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex pthread_mutexattr_t
/* pthread_mutexattr_settype is only exposed on Linux under these feature-test macros */
#if !defined(__linux__) || defined(__USE_UNIX98) || defined(__USE_XOPEN2K8)
# if defined(_DEBUG)
/* debug builds select an error-checking mutex and validate all attribute calls */
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (LIBXSMM_EXPECT(0, pthread_mutexattr_init(ATTR)), \
LIBXSMM_EXPECT(0, pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_ERRORCHECK)))
# else
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (pthread_mutexattr_init(ATTR), \
pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_NORMAL))
# endif
#else
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) pthread_mutexattr_init(ATTR)
#endif
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_EXPECT(0, pthread_mutexattr_destroy(ATTR))
# endif
# if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
/* RW-lock kind mapped to POSIX pthread_rwlock_t; ISRW=1 enables distinct read-side ops */
# define LIBXSMM_LOCK_ACQUIRED_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1
# define LIBXSMM_LOCK_TYPE_rwlock pthread_rwlock_t
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_rwlock_init(LOCK, ATTR))
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_destroy(LOCK))
/* exclusive (writer) operations */
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) pthread_rwlock_trywrlock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_wrlock(LOCK))
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_unlock(LOCK))
/* shared (reader) operations; unlock is common to both sides */
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) pthread_rwlock_tryrdlock(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_rdlock(LOCK))
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock pthread_rwlockattr_t
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_init(ATTR))
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_destroy(ATTR))
# endif
# endif
/* OpenMP based locks need to stay disabled unless both
* libxsmm and libxsmmext are built with OpenMP support.
*/
# if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
/* Spin-lock kind mapped to OpenMP's omp_lock_t. */
# define LIBXSMM_LOCK_ACQUIRED_spin 1
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin omp_lock_t
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) omp_destroy_lock(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) omp_test_lock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) omp_set_lock(LOCK)
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) omp_unset_lock(LOCK)
/* no reader/writer distinction (ISRW=0): read-side ops alias the exclusive ops */
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
/* OpenMP 5.0 (201811) adds hint-based lock initialization; the version test
 * must be a single preprocessing directive (it had been split across lines). */
# if (201811 <= _OPENMP) /* v5.0 */
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR))
# define LIBXSMM_LOCK_ATTR_TYPE_spin omp_lock_hint_t
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = omp_lock_hint_none)
# else
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); }
# define LIBXSMM_LOCK_ATTR_TYPE_spin const void*
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
/* Mutex kind mapped to OpenMP's omp_lock_t (same mapping as the spin kind). */
# define LIBXSMM_LOCK_ACQUIRED_mutex 1
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex omp_lock_t
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) omp_destroy_lock(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) omp_test_lock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) omp_set_lock(LOCK)
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) omp_unset_lock(LOCK)
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
/* single-line directive (was split across physical lines) */
# if (201811 <= _OPENMP) /* v5.0 */
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR))
# define LIBXSMM_LOCK_ATTR_TYPE_mutex omp_lock_hint_t
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = omp_lock_hint_none)
# else
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); }
# define LIBXSMM_LOCK_ATTR_TYPE_mutex const void*
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
/* RW-lock kind falls back to a plain omp_lock_t (ISRW=0: no shared read side). */
# define LIBXSMM_LOCK_ACQUIRED_rwlock 1
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0
# define LIBXSMM_LOCK_TYPE_rwlock omp_lock_t
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) omp_destroy_lock(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) omp_test_lock(LOCK)
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) omp_set_lock(LOCK)
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) omp_unset_lock(LOCK)
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK)
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK)
/* single-line directive (was split across physical lines) */
# if (201811 <= _OPENMP) /* v5.0 */
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR))
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock omp_lock_hint_t
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) (*(ATTR) = omp_lock_hint_none)
# else
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); }
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock const void*
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# elif !defined(LIBXSMM_SYNC_NONE)
/* based on atomic primitives */
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
/* Spin-lock implemented with LIBXSMM's atomic primitives: the lock is a plain
 * integer word (ISPOD=1), so no allocation or attribute state is required. */
# define LIBXSMM_LOCK_ACQUIRED_spin 0
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 1
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin volatile LIBXSMM_ATOMIC_LOCKTYPE
/* initialization only zeroes the lock word; the attribute is unused */
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); }
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_UNUSED(LOCK)
/* the atomic trylock result is negated and offset so success equals LIBXSMM_LOCK_ACQUIRED_spin */
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) (LIBXSMM_LOCK_ACQUIRED_spin + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED))
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED)
/* no reader/writer distinction (ISRW=0): read-side ops alias the exclusive ops */
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
/* Mutex kind: same atomic word-based implementation as the spin kind above. */
# define LIBXSMM_LOCK_ACQUIRED_mutex 0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 1
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex volatile LIBXSMM_ATOMIC_LOCKTYPE
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); }
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) (LIBXSMM_LOCK_ACQUIRED_mutex + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED))
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex int
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
/* RW-lock kind: exclusive-only fallback (ISRW=0), same atomic implementation. */
# define LIBXSMM_LOCK_ACQUIRED_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0
# define LIBXSMM_LOCK_TYPE_rwlock volatile LIBXSMM_ATOMIC_LOCKTYPE
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); }
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) (LIBXSMM_LOCK_ACQUIRED_rwlock + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED))
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED)
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK)
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK)
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# else
/* experimental */
# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK)
/* Spin-lock kind backed by the opaque libxsmm_spinlock object: the lock variable
 * holds a pointer obtained from libxsmm_spinlock_create (hence ISPOD=0). */
# define LIBXSMM_LOCK_ACQUIRED_spin 0
# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0
# define LIBXSMM_LOCK_TYPE_ISRW_spin 0
# define LIBXSMM_LOCK_TYPE_spin libxsmm_spinlock*
# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_spinlock_create()); }
# define LIBXSMM_LOCK_DESTROY_spin(LOCK) libxsmm_spinlock_destroy(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) libxsmm_spinlock_trylock(*(LOCK))
# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) libxsmm_spinlock_acquire(*(LOCK))
# define LIBXSMM_LOCK_RELEASE_spin(LOCK) libxsmm_spinlock_release(*(LOCK))
/* no reader/writer distinction (ISRW=0): read-side ops alias the exclusive ops */
# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK)
# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK)
# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_spin int
# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX)
/* Mutex kind backed by the opaque libxsmm_mutex object. */
# define LIBXSMM_LOCK_ACQUIRED_mutex 0
# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0
# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0
# define LIBXSMM_LOCK_TYPE_mutex libxsmm_mutex*
# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_mutex_create()); }
# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) libxsmm_mutex_destroy(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) libxsmm_mutex_trylock(*(LOCK))
# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) libxsmm_mutex_acquire(*(LOCK))
# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) libxsmm_mutex_release(*(LOCK))
# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK)
# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK)
# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK)
# define LIBXSMM_LOCK_ATTR_TYPE_mutex int
# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK)
/* RW-lock kind backed by libxsmm_rwlock; the only kind here with true
 * read-side operations (ISRW=1: tryread/acqread/relread are distinct). */
# define LIBXSMM_LOCK_ACQUIRED_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0
# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1
# define LIBXSMM_LOCK_TYPE_rwlock libxsmm_rwlock*
# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_rwlock_create()); }
# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) libxsmm_rwlock_destroy(*(LOCK))
# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) libxsmm_rwlock_trylock(*(LOCK))
# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) libxsmm_rwlock_acquire(*(LOCK))
# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) libxsmm_rwlock_release(*(LOCK))
# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) libxsmm_rwlock_tryread(*(LOCK))
# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) libxsmm_rwlock_acqread(*(LOCK))
# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) libxsmm_rwlock_relread(*(LOCK))
# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int
# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR)
# endif
# endif
#else
/* no synchronization */
# define LIBXSMM_SYNC_YIELD LIBXSMM_SYNC_PAUSE
# define LIBXSMM_LOCK_SPINLOCK spinlock_dummy
# define LIBXSMM_LOCK_MUTEX mutex_dummy
# define LIBXSMM_LOCK_RWLOCK rwlock_dummy
# define LIBXSMM_LOCK_ACQUIRED(KIND) 0
# define LIBXSMM_LOCK_TYPE_ISPOD(KIND) 1
# define LIBXSMM_LOCK_TYPE_ISRW(KIND) 0
# define LIBXSMM_LOCK_ATTR_TYPE(KIND) int
# define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_UNUSED(ATTR)
# define LIBXSMM_LOCK_TYPE(KIND) int
# define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) { LIBXSMM_UNUSED(LOCK); LIBXSMM_UNUSED(ATTR); }
# define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_LOCK_ACQUIRED(KIND)
# define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_UNUSED(LOCK)
# define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_LOCK_TRYLOCK(KIND, LOCK)
# define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_LOCK_ACQUIRE(KIND, LOCK)
# define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_LOCK_RELEASE(KIND, LOCK)
#endif
#if (0 == LIBXSMM_SYNC)
/* synchronization disabled: file locking compiles away */
# define LIBXSMM_FLOCK(FILE)
# define LIBXSMM_FUNLOCK(FILE)
#elif defined(_WIN32)
# define LIBXSMM_FLOCK(FILE) _lock_file(FILE)
# define LIBXSMM_FUNLOCK(FILE) _unlock_file(FILE)
#else
# if !defined(__CYGWIN__)
# define LIBXSMM_FLOCK(FILE) flockfile(FILE)
# define LIBXSMM_FUNLOCK(FILE) funlockfile(FILE)
/* declare the POSIX stream-locking functions in case the libc headers hide them */
LIBXSMM_EXTERN void flockfile(FILE*) LIBXSMM_THROW;
LIBXSMM_EXTERN void funlockfile(FILE*) LIBXSMM_THROW;
# else
/* Only available with __CYGWIN__ *and* C++0x. */
# define LIBXSMM_FLOCK(FILE)
# define LIBXSMM_FUNLOCK(FILE)
# endif
#endif
/** Synchronize console output */
#define LIBXSMM_STDIO_ACQUIRE() LIBXSMM_FLOCK(stdout); LIBXSMM_FLOCK(stderr)
#define LIBXSMM_STDIO_RELEASE() LIBXSMM_FUNLOCK(stderr); LIBXSMM_FUNLOCK(stdout)
/** Opaque type which represents a barrier. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_barrier libxsmm_barrier;

/** Create barrier from one of the threads (team of ncores x nthreads_per_core). */
LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core);
/** Initialize the barrier from each thread of the team. */
LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid);
/** Wait for the entire team to arrive. */
LIBXSMM_API void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid);
/** Destroy the resources associated with this barrier. */
LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier);
/** DEPRECATED: use libxsmm_barrier_destroy instead. */
#define libxsmm_barrier_release libxsmm_barrier_destroy
/** Spin-lock, which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_SPINLOCK). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spinlock libxsmm_spinlock;
/** Allocate and initialize a spin-lock object. */
LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void);
/** Destroy the given spin-lock and release its resources. */
LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock);
/** Attempt to acquire the lock without blocking; the result signals the outcome. */
LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock);
/** Acquire the lock. */
LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock);
/** Release the previously acquired lock. */
LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock);
/** Mutual-exclusive lock (Mutex), which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_MUTEX). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mutex libxsmm_mutex;
/** Allocate and initialize a mutex object. */
LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void);
/** Destroy the given mutex and release its resources. */
LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* mutex);
/** Attempt to acquire the mutex without blocking; the result signals the outcome. */
LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex);
/** Acquire the mutex. */
LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex);
/** Release the previously acquired mutex. */
LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex);
/** Reader-Writer lock (RW-lock), which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_RWLOCK). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_rwlock libxsmm_rwlock;
/** Allocate and initialize an RW-lock object. */
LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void);
/** Destroy the given RW-lock and release its resources. */
LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock);
/** Attempt to acquire exclusive access without blocking; the result signals the outcome. */
LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock);
/** Acquire exclusive access. */
LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock);
/** Release previously acquired exclusive access. */
LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock);
/** Attempt to acquire read access without blocking; the result signals the outcome. */
LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock);
/** Acquire read access. */
LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock);
/** Release previously acquired read access. */
LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock);
/** Utility function to receive the process ID of the calling process. */
LIBXSMM_API unsigned int libxsmm_get_pid(void);
/**
 * Utility function to receive a Thread-ID (TID) for the calling thread.
 * The TID is not related to a specific threading runtime; TID=0 may not
 * represent the main thread. TIDs are zero-based and consecutive numbers.
 */
LIBXSMM_API unsigned int libxsmm_get_tid(void);
#endif
/*LIBXSMM_SYNC_H*/
third_party/libxsmm/include/libxsmm_timer.h
0 → 100644
View file @
c454d419
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Hans Pabst (Intel Corp.)
******************************************************************************/
#ifndef LIBXSMM_TIMER_H
#define LIBXSMM_TIMER_H
#include "libxsmm_macros.h"
/** Integral type used to store a timer tick. */
typedef unsigned long long libxsmm_timer_tickint;

/** Properties of the timer facility. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_timer_info {
  int tsc; /* NOTE(review): presumably flags a TSC-based time source -- confirm in libxsmm_timer.c */
} libxsmm_timer_info;

/** Query timer properties. */
LIBXSMM_API int libxsmm_get_timer_info(libxsmm_timer_info* info);

/**
 * Returns the current clock tick of a monotonic timer source with
 * platform-specific resolution (not necessarily CPU cycles).
 */
LIBXSMM_API libxsmm_timer_tickint libxsmm_timer_tick(void);
/** Returns the difference between two timer ticks (cycles); avoids potential side-effects/assumptions of LIBXSMM_DIFF. */
LIBXSMM_API_INLINE libxsmm_timer_tickint libxsmm_timer_ncycles(
  libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1)
{
  const libxsmm_timer_tickint ncycles = LIBXSMM_DELTA(tick0, tick1);
  return ncycles;
}
/** Returns the duration (in seconds) between two values received by libxsmm_timer_tick. */
LIBXSMM_API double libxsmm_timer_duration(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1);
#endif
/*LIBXSMM_TIMER_H*/
Prev
1
2
3
4
5
6
7
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment