To perform the dense matrix-matrix multiplication <i>C<sub>m x n</sub> = alpha · A<sub>m x k</sub> · B<sub>k x n</sub> + beta · C<sub>m x n</sub></i>, the full-blown GEMM interface can be treated with "default arguments" (which deviates from the BLAS standard, however without compromising binary compatibility). Default arguments are derived from compile-time constants (configurable) for historic reasons (LIBXSMM's "pre-JIT era").
```C
libxsmm_?gemm(NULL/*transa*/, NULL/*transb*/,
  &m/*required*/, &n/*required*/, &k/*required*/,
  NULL/*alpha*/, a/*required*/, NULL/*lda*/,
  b/*required*/, NULL/*ldb*/,
  NULL/*beta*/, c/*required*/, NULL/*ldc*/);
```
For the C interface (with type prefix `s` or `d`), all arguments including m, n, and k are passed by pointer. This is needed for binary compatibility with the original GEMM/BLAS interface.
```C
libxsmm_gemm(NULL/*transa*/, NULL/*transb*/,
  m/*required*/, n/*required*/, k/*required*/,
  NULL/*alpha*/, a/*required*/, NULL/*lda*/,
  b/*required*/, NULL/*ldb*/,
  NULL/*beta*/, c/*required*/, NULL/*ldc*/);
```
The C++ interface also supplies overloaded versions where m, n, and k can be passed by value (making it clearer that m, n, and k are non-optional arguments).
The FORTRAN interface supports optional arguments (without affecting binary compatibility with the original BLAS interface) by allowing arguments to be omitted where the C/C++ interface allows NULL to be passed.
For convenience, a BLAS-based dense matrix multiplication (`libxsmm_blas_gemm`) is provided for all supported languages. This only re-exposes the underlying GEMM/BLAS implementation, but the interface accepts optional arguments (or NULL pointers in C) where the regular GEMM expects a value. To remove any BLAS-dependency, please follow the [Link Instructions](index.md#link-instructions). A BLAS-based GEMM can be useful for validation/benchmark purposes and, more importantly, as a fallback when building an application-specific dispatch mechanism.
A more recently added variant of matrix multiplication is parallelized based on the OpenMP standard. These routines open an internal parallel region and rely on "classic" thread-based OpenMP. If these routines are called from inside a parallel region, the parallelism is based on tasks (OpenMP 3.0). Please note that all OpenMP-based routines are hosted by the extension library (libxsmmext), which keeps the main library agnostic with respect to the threading runtime.
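For instance, a minimal sketch of the double-precision OpenMP variant (`libxsmm_dgemm_omp`, hosted by libxsmmext), assuming the same pointer-based convention as shown above:
```C
/* sketch: internally opens a parallel region (or relies on tasks if nested);
 * defaults (NULL) apply as with libxsmm_?gemm shown above */
libxsmm_dgemm_omp(NULL/*transa*/, NULL/*transb*/, &m, &n, &k,
  NULL/*alpha*/, a, NULL/*lda*/, b, NULL/*ldb*/,
  NULL/*beta*/, c, NULL/*ldc*/);
```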
### Manual Code Dispatch
Calling a kernel successively (i.e., multiple times) allows amortizing the cost of the code dispatch. Moreover, to customize the dispatch mechanism, one can rely on the following interface.
Overloaded function signatures are provided and allow omitting arguments (C++ and FORTRAN), which are then derived from the [configurable defaults](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h). In C++, `libxsmm_mmfunction<type>` can be used to instantiate a functor rather than making a distinction between numeric types per type-prefix. For lower precision GEMMs, `libxsmm_mmfunction<itype,otype=itype>` optionally takes a second type (output type).
```C
/* generates or dispatches the code specialization */
libxsmm_mmfunction<T> xmm(m, n, k);
if (xmm) { /* JIT'ted code */
  /* can be parallelized, e.g., per OpenMP */
  for (int i = 0; i < n; ++i) {
    xmm(a+i*asize, b+i*bsize, c+i*csize);
  }
}
```
Similarly in FORTRAN (see [samples/smm/smm.f](https://github.com/hfp/libxsmm/blob/master/samples/smm/smm.f)), a generic interface (`libxsmm_mmdispatch`) can be used to dispatch a `LIBXSMM_?MMFUNCTION`. The handle encapsulated by such a `LIBXSMM_?MMFUNCTION` can be called per `libxsmm_call`. Besides dispatching code, one can also call statically generated kernels (e.g., `libxsmm_dmm_4_4_4`) by using the prototype functions included with the FORTRAN and C/C++ interfaces. Prototypes are present whenever static code was requested at compile-time of the library (e.g., per `make MNK="1 2 3 4 5"`).
```FORTRAN
TYPE(LIBXSMM_DMMFUNCTION) :: xmm
CALL libxsmm_dispatch(xmm, m, n, k)
IF (libxsmm_available(xmm)) THEN
  DO i = LBOUND(c, 3), UBOUND(c, 3) ! consider OpenMP
    CALL libxsmm_call(xmm, a(:,:,i), b(:,:,i), c(:,:,i))
  END DO
END IF
```
In case of batched SMMs, it can be beneficial to supply "next locations" such that the upcoming operands are prefetched ahead of time. Such a location would be the address of the next matrix to be multiplied (and not any of the floating-point elements within the "current" matrix-operand). The "prefetch strategy" is requested at dispatch-time of a kernel. A [strategy](libxsmm_be.md#prefetch-strategy) other than `LIBXSMM_PREFETCH_NONE` turns the signature of a JIT'ted kernel into a function with six arguments (`a,b,c, pa,pb,pc` instead of `a,b,c`). To defer the decision about the strategy to a CPUID-based mechanism, one can choose `LIBXSMM_PREFETCH_AUTO`.
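A sketch of dispatching a double-precision kernel with an explicit prefetch strategy (assuming `m`, `n`, and `k` are `libxsmm_blasint` values):
```C
/* sketch: dispatch (or JIT-generate) a DP kernel using default values (NULL)
 * and a CPUID-based prefetch strategy; the result is NULL if unsuccessful */
const int prefetch = LIBXSMM_PREFETCH_AUTO;
const libxsmm_dmmfunction xmm = libxsmm_dmmdispatch(m, n, k,
  NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/,
  NULL/*alpha*/, NULL/*beta*/, NULL/*flags*/, &prefetch);
```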
Above, the pointer-arguments of `libxsmm_dmmdispatch` can be NULL (or OPTIONAL in FORTRAN): for the LDx arguments this means a "tight" leading dimension; alpha, beta, and flags are given by a [default value](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h) (which is selected at compile-time); and for the prefetch strategy a NULL-argument refers to "no prefetch" (equivalent to an explicit `LIBXSMM_PREFETCH_NONE`). By design, the prefetch strategy can be changed at runtime (as soon as valid next-locations are used) without changing the call-site (kernel-signature with six arguments).
<aname="implicit-batches"></a>
```C
if (0 < n) { /* check that n is at least 1 */
# pragma omp parallel for private(i)
  for (i = 0; i < (n - 1); ++i) {
    const double *const ai = a + i * asize;
    const double *const bi = b + i * bsize;
    double *const ci = c + i * csize;
    xmm(ai, bi, ci, ai + asize, bi + bsize, ci + csize);
  }
  /* pseudo-prefetch for the last element of the batch (avoids page fault) */
  xmm(a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize,
      a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize);
}
```
To process a batch of matrix multiplications and to prefetch the operands of the next multiplication ahead of time, the code presented in the [Overview](#overview) section may be modified as shown above. The last multiplication is peeled from the main batch to avoid prefetching out-of-bounds (OOB). Prefetching from an invalid address does not trap an exception, but an (unnecessary) page fault can be avoided.
<aname="explicit-batch-interface"></a>
```C
/** Batched matrix multiplications (explicit data representation). */
int libxsmm_mmbatch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  const char* transa, const char* transb,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc,
  /* remainder completed per the description below: index/stride-based
   * data representation, batch size, and user-defined threading */
  libxsmm_blasint index_base, libxsmm_blasint index_stride,
  const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[],
  const libxsmm_blasint stride_c[],
  libxsmm_blasint batchsize, int tid, int ntasks);
```
To further simplify the multiplication of matrices in a batch, LIBXSMM's batch interface can help to extract the necessary input from a variety of existing structures (integer indexes or arrays of pointers, both with byte-sized strides). The expert interface (see above) can employ a user-defined threading runtime (`tid` and `ntasks`). In case of OpenMP, `libxsmm_mmbatch_omp` is ready-to-use and hosted by the extension library (libxsmmext). Of course, `libxsmm_mmbatch_omp` does not take `tid` and `ntasks` since both are given by OpenMP. Similarly, a sequential version (shown below) is available per `libxsmm_gemm_batch` (libxsmm).
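A sketch of the sequential form, which follows the expert interface above but omits `tid` and `ntasks` (the exact signature is per libxsmm's header and may differ in detail):
```C
/* sketch following the expert interface above (minus tid/ntasks);
 * the index/stride arguments select pointers or indexes per operand */
void libxsmm_gemm_batch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec,
  const char* transa, const char* transb,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  const void* alpha, const void* a, const libxsmm_blasint* lda,
  const void* b, const libxsmm_blasint* ldb,
  const void* beta, void* c, const libxsmm_blasint* ldc,
  libxsmm_blasint index_base, libxsmm_blasint index_stride,
  const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[],
  const libxsmm_blasint stride_c[], libxsmm_blasint batchsize);
```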
Please note that an explicit data representation should already exist and be reused rather than created only to call the explicit batch-interface. Creating such a data structure solely for this purpose can introduce overhead which is hard to amortize. If no explicit data structure exists, a "chain" of multiplications can often be described algorithmically (see [self-hosted batch loop](#implicit-batches)).
<aname="blas-batch-interface"></a>In recent BLAS library implementations, `dgemm_batch` and `sgemm_batch` have been introduced. This BLAS(-like) interface allows for groups of homogeneous batches, which is like an additional loop around the interface as introduced above. On the other hand, the BLAS(-like) interface only supports arrays of pointers for the matrices. In contrast, above interface supports arrays of pointers as well as arrays of indexes plus a flexible way to extract data from arrays of structures (AoS). LIBXSMM also supports this (new) BLAS(-like) interface with `libxsmm_?gemm_batch` and `libxsmm_?gemm_batch_omp` (the latter of which relies on LIBXSMM/ext). Further, existing calls to `dgemm_batch` and `sgemm_batch` can be intercepted and replaced with [LIBXSMM's call wrapper](#call-wrapper). The signatures of `libxsmm_dgemm_batch` and `libxsmm_sgemm_batch` are equal except for the element type (`double` and `float` respectively).
<aname="batch-sync"></a>**Note**: the multi-threaded implementation (`ntasks > 1` or "omp" form of the functions) avoids data races if indexes or pointers for the destination (C-)matrix are duplicated. This synchronization occurs automatically (`beta != 0`), but can be avoided by passing a negative `batchsize`, `group_size` and/or a negative `group_count`.
### User-Data Dispatch
It can be desired to dispatch user-defined data, i.e., to query a value based on a key. This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. This functionality is detailed in the section about [Service Functions](libxsmm_aux.md#user-data-dispatch).
### Call Wrapper
#### Overview
Since the library is binary compatible with existing GEMM calls (BLAS), such calls can be replaced at link-time or intercepted at runtime of an application such that LIBXSMM is used instead of the original BLAS library. There are two cases to consider: (1) static linkage, and (2) dynamic linkage of the application against the original BLAS library. When calls are intercepted, one can select a sequential (default) or an OpenMP-parallelized implementation (`make WRAP=2`).
Intercepted GEMMs can also build a sophisticated statistic (histogram) with LIBXSMM_VERBOSE=4 (or higher). The histogram displays the call sites (debug symbol name) of all intercepted GEMMs (the [example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/wrap/autobatch.c) depicts an OpenMP region hosted by the main function). With level 5 (or higher), the histogram yields the entire content, and even less relevant entries are not pruned. An application must be built with symbols (`-g`) and export symbols similar to shared libraries (`-Wl,--export-dynamic` even when linked statically) in order to display the symbol names of where the GEMMs originated (call site).
**Note**: Intercepting GEMM calls is low effort but implies overhead, which can be relatively high for small-sized problems. LIBXSMM's native programming interface has lower overhead and allows amortizing this overhead when using the same multiplication kernel consecutively, along with sophisticated data prefetch.
#### Static Linkage
An application which is linked statically against BLAS requires wrapping the `sgemm_` and `dgemm_` symbols (an alternative is to wrap only `dgemm_`). Relinking the application (without editing the build system) can often be accomplished by copying and pasting the linker command as it appeared in the console output of the build system, and then re-invoking a modified link step (please also consider `-Wl,--export-dynamic`).
```bash
gcc [...] -Wl,--wrap=dgemm_,--wrap=sgemm_ \
  /path/to/libxsmmext.a /path/to/libxsmm.a \
  /path/to/your_regular_blas.a
```
In addition, existing [BLAS(-like) batch-calls](#blas-batch-interface) can be intercepted as well.
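A sketch of such a link step (which batch symbols are present depends on the BLAS library):
```bash
gcc [...] -Wl,--wrap=dgemm_batch_,--wrap=sgemm_batch_ \
  -Wl,--wrap=dgemm_batch,--wrap=sgemm_batch \
  -Wl,--wrap=dgemm_,--wrap=sgemm_ \
  /path/to/libxsmmext.a /path/to/libxsmm.a \
  /path/to/your_regular_blas.a
```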
Above, GEMM and GEMM_BATCH are both intercepted, however this can be chosen independently. For GEMM_BATCH, the Fortran and C forms of the symbol may both be intercepted (regular GEMM can always be intercepted per `?gemm_`, even when `?gemm` is used in C code).
**Note**: The static link-time wrapper technique may only work with a GCC tool chain (GNU Binutils: `ld`, or `ld` via compiler-driver), and it has been tested with GNU GCC, Intel Compiler, and Clang. However, this does not work under Microsoft Windows (even when using the GNU tool chain or Cygwin).
#### Dynamic Linkage
An application that is dynamically linked against BLAS allows intercepting the GEMM calls at startup time (runtime) of the unmodified executable by using the LD_PRELOAD mechanism. The shared library of LIBXSMMext (`make STATIC=0`) can be used to intercept GEMM calls, for example:
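```bash
# sketch: paths are placeholders for the actual installation
LD_PRELOAD=/path/to/libxsmm/lib/libxsmmext.so \
LD_LIBRARY_PATH=/path/to/libxsmm/lib:${LD_LIBRARY_PATH} \
  ./application
```

### Intel VTune Profiler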
To analyze which kind of kernels have been called, and from where these kernels have been invoked (call stack), the library supports profiling its JIT code using Intel VTune Profiler. To enable this support, VTune's root directory needs to be set at build-time of the library. Enabling symbols (SYM=1 or DBG=1) incorporates VTune's JIT Profiling API:
```bash
source /opt/intel/vtune_profiler/vtune-vars.sh
make SYM=1
```
Above, the root directory is automatically determined from the environment (VTUNE_PROFILER_\*_DIR, or VTUNE_AMPLIFIER_\*_DIR with older versions). This variable is present after sourcing the Intel VTune environment (`source /path/to/vtune_amplifier/amplxe-vars.sh` with older versions), but it can also be provided manually (`make VTUNEROOT=/path/to/vtune_amplifier`). Symbols are not strictly required to display kernel names for the dynamically generated code; however, enabling symbols makes the analysis much more useful for the rest of the (static) code, and hence it has been made a prerequisite. For example, when "call stacks" are collected, it is possible to find out where the JIT code has been invoked by the application.
In case of an MPI-parallelized application, it can be useful to only collect results from a "representative" rank, and to avoid running the event collector in every rank of the application. With Intel MPI, both can be achieved by:
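```bash
# sketch: collect only on the 5th rank (zero-based rank 4) with exclusive PMU;
# result directory, analysis type, and rank count are placeholders
mpirun -gtool 'vtune -r result -data-limit 0 -collect hotspots:4=exclusive' \
  -n 16 ./application
```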
The `:4=exclusive` suffix is related to Intel MPI's mpirun/gtool arguments and is unrelated to VTune's command line syntax (see `vtune --help`, or `amplxe-cl --help` with older versions); such argument(s) need to appear at the end of the gtool-string. For instance, the shown command line selects the 5th rank (zero-based) along with exclusive usage of the performance monitoring unit (PMU) such that only one event-collector runs for all ranks (without a rank-number, all ranks are sampled).
<a name="vtune-jit-api"></a><span>Intel VTune Profiler</span> presents invoked JIT code like functions, which belong to a module named "libxsmm.jit". The function name as well as the module name are supplied by LIBXSMM using VTune's JIT-Profiling API. Below, the shown "function name" (`libxsmm_knl_dnn_23x23x23_23_23_23_a1_b1_p6::mxm`) encodes an AVX-512 ("knl") double-precision kernel ("d") for small dense matrix multiplication, which performs no transposes ("nn"). The name further encodes M=N=K=LDA=LDB=LDC=23, Alpha=Beta=1.0, and a prefetch strategy ("p6").

An application that cannot rely on LIBXSMM's build system can apply `-DLIBXSMM_VTUNE=2` during compilation and link against `${VTUNE_AMPLIFIER_XE_2017_DIR}/lib64/libjitprofiling.a`. For example, TensorFlow with LIBXSMM and Intel VTune Profiler may use this approach to gain insight into LIBXSMM's JIT code (see [here](tensorflow.md#performance-profiling)).
### Linux perf
With LIBXSMM, there is both basic (`perf map`) and extended support (`jitdump`) when profiling an application. To enable perf support at runtime, the environment LIBXSMM_VERBOSE needs to be set to a negative value.
* The basic support can be enabled at compile-time with PERF=1 (implies SYM=1) using `make PERF=1`. At runtime of the application, a map-file ('jit-*pid*.map') is generated (in the '/tmp' directory). This file is automatically read by Linux perf, and enriches the information about unknown code such as JIT'ted kernels.
* The support for "jitdump" can be enabled by supplying JITDUMP=1 (implies PERF=1) or PERF=2 (implies JITDUMP=1) when making the library: `make JITDUMP=1` or `make PERF=2`. At runtime of the application, a dump-file ('jit-*pid*.dump') is generated (in perf's debug directory, usually `$HOME/.debug/jit/`) which includes information about JIT'ted kernels (such as addresses, symbol names, code size, and the code itself). The dump file can be injected into `perf.data` (using `perf inject -j`), and it enables an annotated view of the assembly in perf's report (requires a reasonably recent version of <span>Linux perf</span>).
The "MM" stands for Matrix Multiplication, and the "S" clarifies the working domain i.e., Small Matrix Multiplication. The latter also means the name is neither a variation of "MXM" nor an eXtreme Small Matrix Multiplication but rather about Intel Architecture (x86) - and no, the library is [64‑bit only](https://github.com/hfp/libxsmm/issues/103#issuecomment-256887962). The spelling of the name might follow the syllables of libx\\/smm, libx'smm, or libx‑smm.
> **NOTE**: the library does [not](https://github.com/hfp/libxsmm/issues/103#issuecomment-256887962) support 32-bit architecture (64‑bit only)
## What is a small matrix multiplication?
When characterizing the problem-size using the M, N, and K parameters, a problem-size suitable for LIBXSMM falls approximately within *(M N K)<sup>1/3</sup> \<= 128* (which illustrates that non-square matrices or even "tall and skinny" shapes are covered as well). The library is typically used to generate code up to the specified [threshold](#auto-dispatch). Raising the threshold may not only generate excessive amounts of code (due to unrolling in the M or K dimension), but also lacks a tiling scheme to effectively utilize the cache hierarchy. For auto-dispatched problem-sizes above the configurable threshold (explicitly JIT'ted code is **not** subject to the threshold), LIBXSMM falls back to BLAS. In terms of GEMM, the supported kernels are limited to *Alpha := 1*, *Beta := \{ 1, 0 \}*, and *TransA := 'N'*.
> **NOTE**: *Alpha*, *Beta*, and *TransA* are limited to `1`, `{ 1, 0 }`, and `'N'` respectively.
## What is a small convolution?
In recent years, new workloads such as deep learning and more specifically convolutional neural networks (CNNs) have emerged and are pushing the limits of today's hardware. One of the expensive kernels is a small convolution with certain kernel sizes (3, 5, or 7) such that calculations in frequency space are not the most efficient method when compared with direct convolutions. LIBXSMM's current support for convolutions aims for an easy-to-use invocation of small (direct) convolutions, which are intended for CNN training and classification. The [Interface](#interface-for-convolutions) is currently ramping up, and the functionality is quickly increasing towards a broader set of use cases.
## What about "medium-sized" and big(ger) matrix multiplications?
More recent additions are GEMM routines which are parallelized using OpenMP (`libxsmm_?gemm_omp`). These routines leverage the same specialized kernel routines as the small matrix multiplications, in-memory code generation (JIT), and automatic code/parameter dispatch, but they implement a tile-based multiplication scheme, i.e., a scheme suitable for larger problem-sizes. For *Alpha*, *Beta*, *TransA*, and *TransB*, the limitations of the small matrix multiplication kernels apply. More details can be found in the [description of the xgemm sample code](https://github.com/hfp/libxsmm/tree/master/samples/xgemm#xgemm-tiled-gemm-routines).
## How to determine whether an application can benefit from using LIBXSMM or not?
Given that the application uses BLAS to carry out matrix multiplications, one may use the [Call Wrapper](#call-wrapper) and measure the application performance, e.g., time to solution. However, the latter can improve significantly when using LIBXSMM's API directly. To check whether there are applicable GEMM-calls, the [Verbose Mode](#verbose-mode) can help to gain insight. Further, when an application uses [Intel MKL 11.2](https://registrationcenter.intel.com/en/forms/?productid=2558) (or higher), running the application with the environment variable MKL_VERBOSE=1 (`env MKL_VERBOSE=1 ./workload > verbose.txt`) can collect similar insight (`grep -a "MKL_VERBOSE DGEMM(N,N" verbose.txt | cut -d'(' -f2 | cut -d, -f3-5`).
## Is LIBXSMM compatible from version-to-version, or what is the ABI commitment?
One may have a look at issue [#120](https://github.com/hfp/libxsmm/issues/120#issuecomment-264498939) or [#282](https://github.com/hfp/libxsmm/issues/282#issuecomment-485390494), but in summary:
* Binary compatibility is not continuously tested (only manually for a subset of the API namely SMM domain).
* Major versions are likely breaking binary compatibility with existing integrations (that is typical).
* Minor versions may break binary compatibility of recently introduced features (may not be typical).
* Update and patch versions are binary compatible but may only be released on request (issue).
LIBXSMM's API for Small Matrix Multiplications (SMMs) is considered stable, and all major known applications (e.g., CP2K, EDGE, NEK5K, and SeisSol) either rely on SMMs or are able (and want) to benefit from an improved API of any of the other domains (e.g., DL). Until at least v2.0, LIBXSMM is not able to track or even maintain binary compatibility and hence the SONAME also goes with the semantic version. A [list of public functions](https://github.com/hfp/libxsmm/blob/master/.abi.txt) is maintained (but there is no distinction for a small subset of them that are only meant for communication between LIBXSMM and LIBXSMM/ext).
## I am relying on a prebuilt version of CP2K (or another application), is LIBXSMM incorporated and which version is it?
This can be determined using the environment variable `LIBXSMM_VERBOSE=2` (or higher verbosity). It is not even required to supply an input or workload since the information in question is presented when the program terminates. For example:
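```bash
# sketch: no workload is needed; version information is printed at termination
# ("application" is a placeholder for, e.g., a prebuilt CP2K executable)
LIBXSMM_VERBOSE=2 ./application
```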
## I am relying on a prebuilt version of an application, and I am concerned about optimal compiler flags.
LIBXSMM uses JIT-generated code according to the CPUID of the system, which is independent of the compiler flags used to build the library. If LIBXSMM was incorporated per the [classic ABI](https://libxsmm.readthedocs.io/#classic-library-abi), the `LIBXSMM_DUMP_BUILD=1` environment variable allows printing the build flags used for LIBXSMM at termination of the application. This output can yield hints about the flags used to build the application (if similar).
For concerns regarding the code of an application that cannot benefit from LIBXSMM, one may have a look at the build recipes of the [XCONFIGURE](http://xconfigure.readthedocs.io/) project.
## What Operating Systems are covered by LIBXSMM, and what about Microsoft Windows?
The answer here focuses on the actual runtime support rather than the supported compiler tool chains used to build the library. All flavors of Linux are supported (if the library was successfully built), which includes installations running a security-hardened Linux kernel (SELinux). The Apple OS (OSX) is supported, which also includes more recent SIP-enabled versions (System Integrity Protection). The BSD OS is likely supported, but building the library is only occasionally validated. Microsoft Windows is supported for non-JIT operation, and for most (e.g., GEMM and MATCOPY) of the JIT-kernels (prefetch signature is not supported). There is currently no support for JIT in the DNN domain (no further check is performed i.e., crash at runtime). See also [issue #71](https://github.com/hfp/libxsmm/issues/71).
## Does LIBXSMM have some support for GEMV?
The library generates acceptable code when using `M=1` or `N=1`. For example, building with `make M=16 N=1 K=16 AVX=2` and inspecting the assembly (build directory), or dumping/disassembling the JIT code (see reference documentation), shows the minimum number of load/store instructions. Given that GEMV is a memory-bound operation, this suggests reasonable code quality. LIBXSMM selects from multiple microkernels (specific to each ISA extension) by using a fixed scheme/heuristic, which should be acceptable for GEMV. The sample code under [samples/smm](https://github.com/hfp/libxsmm/blob/master/samples/smm) provides ready-to-use benchmark drivers that can help to compare the performance with LAPACK/BLAS. The aforementioned benchmarks exercise streaming all possible combinations of operands.
## What about complex and mixed types?
This question refers to the following kind of element type of the GEMM interface of LIBXSMM:
* Complex types: complex numbers in single- and double-precision
* Mixed types: e.g., real double-precision and complex double-precision
There are no (immediate) plans to support more types for the GEMM part. Please note that LIBXSMM indeed supports lower-precision GEMM (wgemm).
## What about voting for features?
All feedback and [issue reports](https://github.com/hfp/libxsmm/issues) are handled openly, are welcome and considered ([answered](https://github.com/hfp/libxsmm/issues?q=is%3Aissue+is%3Aclosed), and [collected](https://github.com/hfp/libxsmm/wiki/Development#longer-term-issues)). However, we do not seek for "feature votes" since the development of the library is not a democratic process.
## \<DEPRECATED\> What is the purpose of ROW_MAJOR vs. COL_MAJOR?
This build configuration is deprecated ([issue 85](https://github.com/hfp/libxsmm/issues/85)), otherwise there is nothing one cannot achieve with row-major as opposed to column-major storage order. In particular the choice is not about whether a program is written in C/C++ or in FORTRAN. The ROW_MAJOR setting is just offered for existing code, which calls into function(s) that assume row-major storage order and where these calls are to be replaced by LIBXSMM in a "1:1 fashion". It is encouraged to avoid the ROW_MAJOR setting since BLAS implies COL_MAJOR (and LIBXSMM is supposed to be compatible with BLAS). [More...](https://github.com/hfp/libxsmm/issues/80)
## CP2K Artificial Benchmark
The first code sample given for LIBXSMM was a performance reproducer exercising the same set of kernels usually generated for CP2K's SMM library. The code sample attempted to model how "matrix stacks" are processed in CP2K; however, there are two different code paths in CP2K: (1) the "main" code path used when processing stacks on the host-side, and (2) a code path targeting offload devices. Besides the host-side parallelization via MPI (and perhaps OpenMP), the second code path relies on an additional level of parallelization (which is obviously necessary to drive a potentially highly parallel offload device). Also, this additional level of parallelism is not exactly "nested" in the sense that it participates in sharing the same resources as the host-side. In fact, this "artificial benchmark" (cp2k code sample) models the code path as utilized in the second case (offload device).
## Hello LIBXSMM
This example is focused on a specific functionality but may be considered "Hello LIBXSMM". Copy and paste the example code and build it either manually as described in our [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm) (see underneath the source code), or use GNU Make:
```bash
cd /path/to/libxsmm
make
cd /path/to/libxsmm/samples/hello
make
./hello
```
Alternatively, one can use the Bazel build system. To further simplify, [Bazelisk](https://github.com/bazelbuild/bazelisk) is used to boot-strap [Bazel](https://bazel.build/):
```bash
cd /path/to/libxsmm/samples/hello
bazelisk build //...
./bazel-bin/hello
```
The [C/C++ code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.cpp) given here uses LIBXSMM in header-only form (`#include <libxsmm_source.h>`), which is in contrast to the code shown in the [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm). The [Fortran code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.f) (`hello.f`) can be manually compiled like `gfortran -I/path/to/libxsmm/include hello.f -L/path/to/libxsmm/lib -lxsmmf -lxsmm -lxsmmnoblas -o hello` or as part of the above-described invocation of GNU Make.
## Magazine
### Overview
This collection of code samples accompanies an article written for [issue #34](https://software.intel.com/sites/default/files/parallel-universe-issue-34.pdf) of the magazine [The Parallel Universe](https://software.intel.com/en-us/download/parallel-universe-magazine-issue-34-october-2018), an Intel publication. The article focuses on Blaze-, Eigen-, and LIBXSMM-variants of Small Matrix Multiplications (SMMs). The set of sample codes now also includes a variant relying on BLAS and a variant that showcases LIBXSMM's explicit batch-interface.
The baseline requirements are libraries that can operate on column-major storage order, "zero copy" when using existing memory buffers, and an API that is powerful enough to describe leading dimensions. Typically, a library-internal parallelization of matrix multiplication is desired. However, for the magazine sample collection no performance gain is expected since the matrices are small, and nested parallelism may only add overhead. Hence library-internal parallelism is disabled (BLAZE_USE_SHARED_MEMORY_PARALLELIZATION=0, EIGEN_DONT_PARALLELIZE). LIBXSMM provides parallelization on a per-function basis, and no global toggle is needed.
The sample codes rely on the minimum programming language supported by the library in question (API): C++ in case of Blaze and Eigen, and C in case of LIBXSMM (both C++ and Fortran interfaces are available as well). For Blaze and Eigen, the build-system ensures not to map the implementation to a BLAS library (normally desired, but this would not test the library-native implementation).
### Results
To reproduce or repeat the performance measurements on a system of choice, all matrix operands are streamed by default. The file [magazine.h](https://github.com/hfp/libxsmm/blob/master/samples/magazine/magazine.h) can be edited to reproduce the desired combination (STREAM_A, STREAM_B, and STREAM_C). Whether or not matrix operands are streamed is motivated in the publication. To reduce the dependency on the compiler's OpenMP implementation, the benchmarks run single-threaded by default (`make OMP=1` can parallelize the batch of matrix multiplications). The outer/batch-level parallelization is also disabled to avoid accounting for proper first-touch memory population on multi-socket systems (NUMA); for the latter, the init-function (located in magazine.h) is not parallelized for simplicity.
```bash
cd libxsmm; make
cd samples/magazine; make
```
To run the benchmark kernels presented by the article:
```bash
./benchmark.sh
```
Please note that if multiple threads are enabled and used, an appropriate pin-strategy should be used (OMP_PLACES=threads, OMP_PROC_BIND=TRUE). To finally produce the benchmark charts:
```bash
./benchmark-plot.sh blaze
./benchmark-plot.sh eigen
./benchmark-plot.sh xsmm
```
The plot script relies on Gnuplot at a minimum. ImageMagick (mogrify) can also be useful if PNGs are created, e.g., `./benchmark-plot.sh xsmm png 0` (the last argument disables single-file charts, in contrast to the multi-page PDFs created by default; the option also disables chart titles).
The set of kernels executed during the benchmark can be larger than the kernels presented by the plots: [benchmark.set](https://github.com/hfp/libxsmm/blob/master/samples/magazine/benchmark.set) selects the kernels independent of the kernels executed (union).
## NEK Sample Collection
This directory contains kernels taken from Nek{Box,5000}. They aim to represent most of the matrix-matrix workloads.
Please note that the [mxm_std.f](https://github.com/hfp/libxsmm/blob/master/samples/nek/mxm_std.f) source code is protected by an (US) GOVERNMENT LICENSE, and under the copyright of the University of Chicago.
### stpm
Small tensor-product multiple (stpm) replicates the axhelm kernel, which computes the Laplacian with spectral elements.
Usage:
```bash
./stpm m n k size1 size
```
The elements are m-by-n-by-k, mode picks the LIBXSMM interface used, and size scales the number of spectral elements.
### rstr
Restriction operator transforms elements from one size to another. This occurs in multi-grid, the convection operator, and, when the sizes are the same, the local Schwarz solves. Usage:
```bash
./rstr m n k mm nn kk size1 size
```
The input elements are m-by-n-by-k and the output elements are mm-by-nn-by-kk. When m=mm, n=nn, and k=kk, this is half of a Schwarz solve.
## SMM Sample Collection
This collection of code samples exercises different memory streaming cases when performing the matrix multiplication <i>C<sub>m x n</sub> = alpha · A<sub>m x k</sub> · B<sub>k x n</sub> + beta · C<sub>m x n</sub></i>: (1) streaming the matrices A, B, and C, which is usually referred to as batched matrix multiplication, (2) streaming the inputs A and B but accumulating C within cache, (3) streaming the A and C matrices while B is kept in cache, (4) streaming the B and C matrices while A is kept in cache, and (5) not streaming any of the operands but repeating the very same multiplication until the requested number of matrix multiplications has been completed.
Besides measuring the duration of a test case, the performance is presented in GFLOPS/s. As an alternative metric, the memory bandwidth is given (the artificial "cached" case omits the cache-memory bandwidth). The "pseudo-performance" given in FLOPS/cycle is an artificial score: it not only uses a non-standard formula for calculating the FLOPS (*2 \* M \* N \* K - M \* N* rather than *2 \* M \* N \* K*) but also relies on (pseudo-)clock cycles:
```
$ ./specialized.sh 0
m=32 n=32 k=32 size=87381 memory=2048.0 MB (DP)
Batched (A,B,C)...
pseudo-perf.: 10.7 FLOPS/cycle
performance: 23.9 GFLOPS/s
bandwidth: 11.1 GB/s
duration: 239 ms
Finished
```
There are two sub-collections of sample codes: (1) a collection of C++ code samples showing BLAS, compiler-generated (inlined) code, LIBXSMM/dispatched, or LIBXSMM/specialized functions to carry out the multiplication, and (2) a Fortran sample code showing BLAS versus LIBXSMM, including some result validation.
**C/C++ Code Samples: Command Line Interface (CLI)**
* Takes an optional number (1st arg.) to select the streaming-case (0...8)
* Optionally takes the M, N, and K parameter of the GEMM in this order
* If only M is supplied, the N and K "inherit" the M-value
* Example I (A,B,C): ./specialized.sh 0 16 8 9
* Example II (A,B): ./specialized.sh 6 16
**Fortran Code Sample: Command Line Interface (CLI)**
* Optionally takes the M, N, and K parameter of the GEMM in this order
* Optional problem size (in MB) of the workload; M/N/K must have been supplied
* Optional total problem size (in MB) implying the number of repeated runs
* If only M is supplied, the N and K are "inheriting" the M-value
* Shows the performance of each of the streaming cases
* Example I: ./smm.sh 16 8 9 1024 16384
* Example II: ./smm.sh 16
## SPECFEM Sample
This sample contains a dummy example from a spectral-element stiffness kernel taken from [SPECFEM3D_GLOBE](https://github.com/geodynamics/specfem3d_globe).
It is based on a 4th-order, spectral-element stiffness kernel for simulations of elastic wave propagation through the Earth. The matrix sizes used are (25,5), (5,25), and (5,5), determined by different cut-planes through a three-dimensional (5,5,5)-element with a total of 125 GLL points.
### Usage Step-by-Step
This example needs the LIBXSMM library to be built with static kernels, using MNK="5 25" (for the matrix sizes (5,25), (25,5), and (5,5)).
#### Build LIBXSMM
##### General Default Compilation
In LIBXSMM root directory, compile the library with:
```bash
make MNK="5 25"ALPHA=1 BETA=0
```
##### Additional Compilation Examples
Compilation using only single precision version and aggressive optimization:
```bash
make MNK="5 25"ALPHA=1 BETA=0 PRECISION=1 OPT=3
```
For Sandy Bridge CPUs:
```bash
make MNK="5 25"ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1
```
For Haswell CPUs:
```bash
make MNK="5 25"ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=2
```
For Knights Corner (KNC) (and thereby creating a Sandy Bridge version):
```bash
make MNK="5 25"ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \
OFFLOAD=1 KNC=1
```
Installing libraries into a sub-directory workstation/:
```bash
make MNK="5 25"ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \
OFFLOAD=1 KNC=1 \
PREFIX=workstation/ install-minimal
```
#### Build SpecFEM example code
For default CPU host:
```bash
cd samples/specfem
make
```
For Knights Corner (KNC):
```bash
cd samples/specfem
make KNC=1
```
Additionally, adding some specific Fortran compiler flags, for example:
```bash
cd samples/specfem
make FCFLAGS="-O3 -fopenmp" [...]
```
Note that steps 1 and 2 could be shortened by specifying a "specfem" make target in the LIBXSMM root directory:
```bash
make MNK="5 25"ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 specfem
```
For Knights Corner, this would need two steps:
```bash
make MNK="5 25"ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 OFFLOAD=1 KNC=1
make OPT=3 specfem_mic
```
### Run the Performance Test
For default CPU host:
```bash
./specfem.sh
```
For Knights Corner (KNC):
```bash
./specfem.sh -mic
```
### Results
Using Intel Compiler suite: icpc 15.0.2, icc 15.0.2, and ifort 15.0.2.
## Matrix Transpose (TCOPY)
This code sample aims to benchmark the performance of matrix transposes. The C/C++ and [FORTRAN sample code](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.f) differ slightly, with the C/C++ code sample offering a richer set of command line options as well as build settings available inside of the [translation unit](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.c).
The available command line options of the sample code may be reviewed by looking into the source code. Generally, the idea is to support the following:
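A sketch of the command line (the binary name and exact argument order are assumptions; consult the source code):
```bash
./transpose <kind:o|i> <m> [<n>] [<ldi>] [<ldo>]
```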
Above, `m` and `n` specify the matrix shape, and `ldi` the leading dimension of the matrix. The argument `ldo` allows specifying an output dimension, which may differ from `ldi`. The transpose kind shall be either out-of-place (`o`) or in-place (`i`).
Instead of executing a wrapper script, one may affinitize the multi-threaded execution manually (OpenMP runtime). In case of an executable built using the Intel Compiler, this may look like the following sketch (affinity settings and matrix extents are examples):
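```bash
# sketch: out-of-place ("o") transpose of a 20000x20000 matrix;
# affinity per the Intel OpenMP runtime (KMP_AFFINITY)
LIBXSMM_VERBOSE=2 KMP_AFFINITY=granularity=fine,compact,1,0 \
  ./transpose o 20000 20000
```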
In the above case, one can see from the verbose output (`LIBXSMM_VERBOSE=2`) that a single kernel (tcopy) served to transpose the entire matrix. To avoid duplicating JIT-kernels under contention (code registry), one may also consider `LIBXSMM_TRYLOCK=1`, which is also available per API-call.
### OpenTuner
To tune the tile sizes ("block sizes") internal to LIBXSMM's transpose routine, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`transpose_opentuner.py`) is provided, which accepts a range of matrix sizes as command line arguments.
To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. Otherwise, the population of previously generated tuning results is fetched from a database and used to tune a potentially unrelated range of matrix shapes. To get reliable timings, the total time for all experiments per epoch is minimized (hence a different number of experiments per epoch also asks for its own database). Optionally, the initial block size can be seeded (`tile-size-m` and `tile-size-n`).
```bash
rm -rf opentuner.db
```
The script tunes matrices with randomized shape according to the specified range. The leading dimension is chosen tightly for the experiments. The optimizer not only maximizes the performance but also minimizes the value of *M \* N* (which also helps to prune duplicated results due to an additional preference).
```bash
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 1 1024 1000
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 1024 2048 100
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 2048 3072 20
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 3072 4096 20
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 4096 5120 16
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 5120 6144 12
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 6144 7168 8
rm -rf opentuner.db
./transpose_opentuner.py --no-dups 7168 8192 6
```
The tuning script uses the environment variables `LIBXSMM_TCOPY_M` and `LIBXSMM_TCOPY_N`, which are internal to LIBXSMM. These variables are used to adjust certain thresholds in `libxsmm_otrans` or to request a specific tiling-scheme inside of the `libxsmm_otrans_omp` routine.
## XGEMM: Tiled GEMM Routines
### Overview
This sample code calls the `libxsmm_?gemm_omp` routines provided by the LIBXSMM extension library (`libxsmmext`). These routines are meant for big(ger) xGEMM routines, and thereby provide an OpenMP-based parallelization.
The driver program (`xgemm.c`) currently accepts all typical GEMM arguments (except for the transposition specifiers): `m`, `n`, `k`, `lda`, `ldb`, `ldc`, `alpha`, and `beta`. All arguments are optional (or inherit defaults from previously specified arguments). Matrix transposition as part of the `libxsmm_?gemm_omp` routines will become available in an upcoming release of LIBXSMM. Please also note that unsupported Alpha or Beta values cause a fallback to the related BLAS routine. Single-precision matrix multiplications require changing the `ITYPE` in `xgemm.c`.
```bash
./xgemm.sh 2000
```
### OpenTuner
To tune the tile sizes ("block sizes") internal to LIBXSMM, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`xgemm_opentuner.py`) is provided, which optionally accepts a list of grouped parameters as command line arguments. The syntax of the arguments is per LIBXSMM's `MNK` build-option, and expands to "triplets" specifying the matrix shapes. For instance, four matrix multiplications of square-matrices can be benchmarked and tuned using the following command.
```bash
./xgemm_opentuner.py 1024,1280,1536,1792
```
To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. Otherwise the population of previously generated tuning results is fetched from a database and used to tune an unrelated range of matrix shapes. Optionally, the initial block size can be seeded (`tile-size-m`, `tile-size-n`, and `tile-size-k`).
```bash
rm -rf opentuner.db
```
The script tunes the geometric mean of the performance for each of the requested triplets. However, the optimizer not only maximizes the performance but also minimizes the value of *M \* N \* K* (which also helps to prune duplicated results due to an additional preference). As a limitation of the current implementation, the multiplication kernels are not accompanied by copy-kernels (and not accompanied by transpose kernels), which negatively impacts performance for power-of-two (POT) matrix shapes due to thrashing the LLC. However, it has been found that tuning for POT shapes likely achieves superior performance when compared to tuning for non-POT shapes of the same range.
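Such a series of tuning runs might look like the following sketch (the grouped triplets are illustrative; compare the analogous series in the transpose sample above):
```bash
rm -rf opentuner.db
./xgemm_opentuner.py 192,256,320,512,768
rm -rf opentuner.db
./xgemm_opentuner.py 1024,1280,1536,1792
rm -rf opentuner.db
./xgemm_opentuner.py 2048,2304,2560,2816
rm -rf opentuner.db
./xgemm_opentuner.py 3072,3328,3584,3840
rm -rf opentuner.db
./xgemm_opentuner.py 4096,4416,4736
rm -rf opentuner.db
./xgemm_opentuner.py 5120,5440,5760
rm -rf opentuner.db
./xgemm_opentuner.py 6144,6464,6784
rm -rf opentuner.db
./xgemm_opentuner.py 7168,7488,7808,8192
```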
Above, the series of matrix multiplications from 192-8K is separately tuned in eight ranges. The tuning script uses the environment variables `LIBXSMM_TGEMM_M`, `LIBXSMM_TGEMM_N`, and `LIBXSMM_TGEMM_K` which are internal to LIBXSMM. These variables are used to request a specific tiling-scheme within LIBXSMM's `libxsmm_?gemm_omp` routines.
## 1D Dilated Convolutional Layer
This package contains the optimized kernels for the 1D dilated convolutional layer.
The C++ implementation has code for both FP32 and BF16 formats.
You can run this code on AVX-512 enabled CPUs, e.g., Cascade Lake or Cooper Lake.
### Install Instructions
Install PyTorch in an anaconda or virtual environment before installing the package.
If any of the previous parameters is an odd number, the code runs in FP32 format.
Keep the batch size as a multiple of the unutilized cores (e.g., 28, 56, 84, 128 ... on a 28-core Cascade Lake) for optimal performance with the Conv1dOpti layer. Each batch runs on a separate thread, thus performance may go down if some cores are not free, or if the batch size is not equal to the number of free cores. Keep the batch size as a power of 2 with the MKLDNN backend (Conv1d) for optimal performance.

## Deep Learning with GxM
### Compiling and Building GxM
1. Install Pre-requisite Libraries: Google logging module (glog), gflags, Google's data interchange format (Protobuf), OpenCV, LMDB
2. In Makefile.config, set GXM_LIBRARY_PATH variable to the path containing above libraries
3. In Makefile.config, set LIBXSMM_PATH variable to the path containing LIBXSMM library
4. Set/clear other flags in Makefile.config as required (see associated comments in Makefile.config)
5. `source setup_env.sh`
6. `make clean; make`
### Running GxM
The network topology definitions directory is "model_zoo". Currently, it contains definitions for AlexNet (without LRN), ResNet-50, and Inception v3, along with CIFAR10 and MNIST as simple test definitions. Each topology definition is in a .prototxt file. ResNet-50 can run with "dummy data", raw JPEG image data, or with LMDB. Filenames indicate the data source along with the minibatch size. Inception v3 runs only with compressed LMDB data.
The hyperparameter definitions for each topology are also in the corresponding directory under "model_zoo", in a .prototxt file with the suffix "solver". For a single node, this file is called solver.prototxt. For multiple nodes, the filename also contains the global minibatch size (= single-node minibatch size x number of nodes); e.g., solver_896.prototxt contains hyperparameters for MB=56 per node and 16 nodes. The "solver*" file also contains a flag that specifies whether to start execution from a checkpoint (and thus load weights from the "./weights" directory) or from scratch; by default, execution starts from scratch.
Optimal parallelization of Convolutional layers in LIBXSMM happens when the number of OpenMP threads = MiniBatch.
Make sure that the makefile follows the OpenCV Ver 3 path!
## DNN Training with Incremental Sparsification + Sparse JIT Kernels
### This project contains code for the following DNN models
1. Resnet - ported from [link](https://pytorch.org/vision/stable/models.html)
2. Transformer - ported from [link](https://github.com/pytorch/fairseq)
3. DLRM - ported from [link](https://github.com/facebookresearch/dlrm)
4. PCL_MLP - A python extension of the `torch.nn.Linear` module that uses efficient sparse JIT kernels for matrix multiplication (supports forward, backward and update pass) - ported from [link](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/sparse_weight_mult)
### Features
1. Training scripts for all three models, located at the root of each directory in the form of a shell file
2. By specifying four parameters (the pruning criterion, magnitude-based or random-based; the pruning start time and end time; and the target sparsity), you can apply incremental sparsity to model weights during training
3. Additionally, by specifying a tensorboard log directory, one can examine training logs and metrics using tensorboard.
### Data preparation
Each model requires an extensive amount of data to be properly stress-tested against incremental sparsity. According to [The State of Sparsity](https://arxiv.org/abs/1902.09574) and by extensive experimentation, using a relatively small dataset or an overparameterized model may lead to misleading performance conclusions. For instance, when a ResNet-50 model is trained with the CIFAR-10 dataset, or if the base Transformer is trained with a limited sentence-pair dataset (i.e., EN-VI), it may seem as if the model isn't impacted even at extremely high sparsity, since the model was overdetermined to begin with.
- For Resnet
- For Resnet training, a smaller subset of ImageNet called ImageNette was used (due to ImageNet's massive size). Download from [here](https://github.com/fastai/imagenette).
- For Transformer
- As a neural machine translation task, the transformer model requires the WMT2014 EN_DE dataset. Preprocessing steps are described [here](https://fairseq.readthedocs.io/en/latest/getting_started.html#data-pre-processing)
- For DLRM
- Training the DLRM requires the terabyte dataset [link](https://labs.criteo.com/2013/12/download-terabyte-click-logs/)
### Running scripts
Each project consists of two scripts: a script that launches `sbatch` scripts for experimenting with various target sparsities (usually named `launch_pruning_runs.sh`) and a script that runs a single experiment. Use accordingly.
1. ResNet model
`./launch_pruning_jobs.sh ${TARGET_SPARSITY}` or
`python train.py ${TARGET_SPARSITY}`
2. Transformer (FAIRSEQ) model
`./launch_pruning_runs.sh` or `./prune_en_de.sh ${TARGET_SPARSITY} ${PRUNE_TYPE} ${EMB}`
where PRUNE_TYPE is either `magnitude` or `random` and EMB indicates whether the embedding portion is pruned alongside the weights
3. DLRM model
`./launch_pruning_runs.sh` or `./run_terabyte.sh ${TARGET_SPARSITY} ${PRUNE_TYPE}`
where PRUNE_TYPE is either `magnitude` or `random`
## Xsmm LSTM
This code may be integrated with Tensorflow to make use of LIBXSMM's LSTM. Support for creating a Python wheel and a pip package can be found in the [directory](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/tf_lstm_ops) as well.
## Dispatch<a name="dispatch-microbenchmark"></a>
### Microbenchmark
This code sample benchmarks the performance of (1) the dispatch mechanism, and (2) the time needed to JIT-generate code for the first time. Both mechanisms are relevant when replacing GEMM calls (see [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation), or in any case of calling LIBXSMM's native [GEMM functionality](https://libxsmm.readthedocs.io/libxsmm_mm/).
**Command Line Interface (CLI)**
* Optionally takes the number of dispatches/code-generations (default: 10000).
* Optionally takes the number of threads (default: 1).
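For example, a sketch of invoking the benchmark (the binary name is assumed from the sample's source file and may differ):
```bash
./dispatch 100000 4
```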
**Measurements (Benchmark)**
* Duration of an empty function call (serves as a reference timing).
* Duration to find an already generated kernel (cached/non-cached).
* Duration to JIT-generate a GEMM kernel.
In case of a multi-threaded benchmark, the timings represent a highly contended request (worst case). For thread-scaling, it can be observed that read-only accesses (code dispatch) stay roughly with a constant duration whereas write-accesses (code generation) are serialized and hence the duration scales linearly with the number of threads.
The [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch.f) (`dispatch.f`) could use `libxsmm_dmmdispatch` (or similar) like the C code (`dispatch.c`) but intentionally shows the lower-level dispatch interface `libxsmm_xmmdispatch` and also omits using the LIBXSMM module. Not using the module confirms: the same task can be achieved by relying only on FORTRAN 77 language level.
### User-Data Dispatch
Further, another [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch_udt.f) about [user-data dispatch](https://libxsmm.readthedocs.io/libxsmm_aux/#user-data-dispatch) is not exactly a benchmark. Dispatching user-data containing multiple kernels can obviously save multiple singular dispatches. The C interface for dispatching user-data is designed to follow the same flow as the Fortran interface.
## MHD Image I/O
This code sample aims to provide a simple piece of code, which takes an image and produces a visual result using LIBXSMM's MHD image file I/O. Performing a single convolution is *not* a showcase of LIBXSMM's deep learning domain, as the code only runs over a single image with one channel.
LIBXSMM's CNNs are vectorized over image channels (multiple images) according to the native vector-width of the processor and otherwise fall back to a high-level implementation.
**Note**: For high-performance deep learning, please refer to the collection of [CNN layer samples](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/cnnlayer).
The executable can run with the following arguments (all arguments are optional):
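A sketch of the invocation (the argument order is an assumption; the arguments are explained below):
```bash
./mhd [<filename-in>] [<nrepeat>] [<kw>] [<kh>] [<filename-out>]
```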
For stable timing (benchmark), the key operation (convolution) may be repeated (`nrepeat`). Further, `kw` and `kh` can specify the kernel-size of the convolution. The `filename-in` and `filename-out` arguments name the MHD-files used as input and output, respectively. The `filename-in` may be a pseudo-file (which does not exist) that specifies the resolution of the generated input (`w`[x`h`], where the file `wxh.mhd` stores the generated image data). To load an image from a familiar format (JPG, PNG, etc.), please have a look at [Meta Image File I/O](https://libxsmm.readthedocs.io/libxsmm_aux/#meta-image-file-io).
## Scratch Memory Allocation (Microbenchmark)
This code sample aims to benchmark the performance of the scratch memory allocation. This facility is a viable option to satisfy the need for temporary memory when using the DNN domain of LIBXSMM (small convolutions). Although any kind of readable/writable buffer can be bound to a convolution handle, LIBXSMM's `libxsmm_aligned_scratch` features a thread-safe linear allocator mechanism which can help to lower allocation overhead.
## Wrapped DGEMM
This code sample is calling DGEMM and there is no dependency on the LIBXSMM API as it only relies on LAPACK/BLAS interface. Two variants are linked when building the source code: (1) code which is dynamically linked against LAPACK/BLAS, (2) code which is linked using `--wrap=`*symbol* as possible when using a GNU GCC compatible tool chain. For more information, see the [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation.
The same (source-)code will execute in three flavors when running `dgemm-test.sh`: (1) code variant which is dynamically linked against the originally supplied LAPACK/BLAS library, (2) code variant which is linked using the wrapper mechanism of the GNU GCC tool chain, and (3) the first code but using the LD_PRELOAD mechanism (available under Linux).
**Command Line Interface (CLI)**
* Optionally takes the number of repeated DGEMM calls
* Shows the performance of the workload (wall time)
To improve thread-scalability and to avoid frequent memory allocation/deallocation, the [scratch memory allocator](libxsmm_aux.md#memory-allocation) can be leveraged by intercepting existing malloc/free calls. This facility is built into LIBXSMM's main library, but disabled at compile-time (by default); build with `make MALLOC=1` to permanently enable, or build with `make MALLOC=-1` to even require an environment variable `LIBXSMM_MALLOC=1` or an API-call (`libxsmm_set_malloc`). Both runtime settings allow an optional lower and/or an upper bound to select malloc-calls based on the size of the allocation. For the environment option, an extra variable is introduced, e.g., use `LIBXSMM_MALLOC=1 LIBXSMM_MALLOC_LIMIT=4m:1g`.
Querying the status may return zero even if there was an attempt to enable this facility (limitation/experimental implementation). Please note that the regular [Scratch Memory API](libxsmm_aux.md#memory-allocation) (e.g., `libxsmm_[get|set]_scratch_limit`) and the related environment variables can apply as well (`LIBXSMM_SCRATCH_LIMIT`, `LIBXSMM_SCRATCH_POOLS`, `LIBXSMM_SCRATCH_SCALE`). If intercepted memory allocations are enabled, the scratch limit is adjusted by default to allow unlimited growth of the scratch domain. Further, an increased verbosity level can help to gain some insight (`LIBXSMM_VERBOSE=3`).
Intercepting malloc/free is supported by linking LIBXSMM's static or shared main library. The latter of which can be used to intercept calls of an existing and unchanged binary (LD_PRELOAD mechanism). To statically link with LIBXSMM and to intercept existing malloc/free calls, the following changes to the application's link stage are recommended:
```bash
gcc [...] -Wl,--export-dynamic \
  -Wl,--wrap=malloc,--wrap=calloc,--wrap=realloc \
  -Wl,--wrap=memalign,--wrap=free \
  /path/to/libxsmm.a
```
The main library causes a BLAS-dependency, which may already be fulfilled for the application in question. However, if this is not the case (unresolved symbols), `libxsmmnoblas.a` must be linked in addition. Depending on the dependencies of the application, the link order may also need to be adjusted. Compilers other than a GNU-compatible compiler (as shown above) can induce additional requirements (compiler runtime libraries).
**Note**: The Intel Compiler may need "libirc", i.e., `-lirc` in front of `libxsmm.a`. Linking LIBXSMM's static library may require the above-mentioned linker flags (`--wrap`), in particular when using Intel Fortran (IFORT) as the linker driver, unless `CALL libxsmm_init()` is issued (or at least one symbol of LIBXSMM's main library is referenced; check with `nm application | grep libxsmm`). Linking the static library using the GNU compiler does not strictly need special flags when linking the application.
Linking the shared library form of LIBXSMM (`make STATIC=0`) has similar requirements with respect to the application, but it does not require `-Wl,--wrap`, although `-Wl,--export-dynamic` is necessary if the application is statically linked (besides LIBXSMM being linked in a shared fashion). The LD_PRELOAD based mechanism does not need any changes to the link step of an application. However, `libxsmmnoblas` may be required if the application does not already link against BLAS.
**Note**: If the application already uses BLAS, of course `libxsmmnoblas` must not be used!
The following code can be compiled and linked with `gfortran example.f -o example`:
```fortran
      PROGRAM allocate_test
        DOUBLE PRECISION, ALLOCATABLE :: a(:), b(:), c(:)
        INTEGER :: i, repeat = 100000
        DOUBLE PRECISION :: t0, t1, d
        ALLOCATE(b(16*1024))
        ALLOCATE(c(16*1024))
        CALL CPU_TIME(t0)
        DO i = 1, repeat
          ALLOCATE(a(16*1024*1024))
          DEALLOCATE(a)
        END DO
        CALL CPU_TIME(t1)
        DEALLOCATE(b)
        DEALLOCATE(c)
        d = t1 - t0
        WRITE(*, "(A,F10.1,A)") "duration:", (1D3 * d), " ms"
      END PROGRAM
```
Running with `LIBXSMM_VERBOSE=3 LIBXSMM_MALLOC=1 LD_PRELOAD=... LD_LIBRARY_PATH=... ./example` displays `Scratch: 132 MB (mallocs=1, pools=1)`, which shows that the innermost allocation/deallocation was served by the scratch memory allocator.
### Static Specialization
By default, LIBXSMM uses the [JIT backend](index.md#jit-backend), which automatically builds optimized code (JIT=1). Matrix multiplication kernels can also be statically specialized at compile-time of the library (M, N, and K values). This mechanism also extends the interface of the library because function prototypes are included in both the C and FORTRAN interface.
```bash
make M="2 4" N="1" K="$(echo $(seq 2 5))"
```
The above example is generating the following set of (M,N,K) triplets:
```bash
(2,1,2), (2,1,3), (2,1,4), (2,1,5),
(4,1,2), (4,1,3), (4,1,4), (4,1,5)
```
The index sets are in a loop-nest relationship (M(N(K))) when generating the indexes. Moreover, an empty index set resolves to the next non-empty outer index set of the loop nest (wrapping around from the M to the K set if necessary); an empty index set does not otherwise participate in the loop-nest relationship. Here is an example of generating multiplication routines which are "squares" with respect to M and N (N inherits the current value of the "M loop"):
```bash
make M="$(echo $(seq 2 5))" K="$(echo $(seq 2 5))"
```
An even more flexible specialization is possible by using the MNK variable when building the library. It takes a list of indexes which can optionally be grouped (using commas):
```bash
make MNK="2 3, 23"
```
Each group of the above indexes is combined into all possible triplets generating the following set of (M,N,K) values:
```bash
(2,2,2), (2,2,3), (2,3,2), (2,3,3),
(3,2,2), (3,2,3), (3,3,2), (3,3,3), (23,23,23)
```
Of course, both mechanisms (M/N/K-based and MNK-based) can be combined on the same command line (make). Static specialization and JIT can also be combined (there is no need to turn off the JIT backend).
### User-Data Dispatch
It can be desirable to dispatch user-defined data, i.e., to query a value based on a key. This functionality can be used, e.g., to dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one pays the cost of the dispatch once per task rather than once per JIT-kernel used by this task. This functionality is detailed in the section about [Service Functions](libxsmm_aux.md#user-data-dispatch).
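As a sketch of this idea (the key/value layout below is made up for the example; the entry points follow the [Service Functions](libxsmm_aux.md#user-data-dispatch) documentation and should be verified there), a bundle of kernels can be registered under one key and later retrieved with a single lookup:
```C
#include <libxsmm.h>

typedef struct kernel_bundle { /* user-defined value: multiple kernels */
  libxsmm_dmmfunction gemm1, gemm2;
} kernel_bundle;

void example(int m, int n, int k) {
  const int key[] = { m, n, k }; /* user-defined key (plain data) */
  /* one lookup for the whole bundle instead of one lookup per kernel */
  kernel_bundle* bundle = (kernel_bundle*)libxsmm_xdispatch(key, sizeof(key));
  if (NULL == bundle) { /* not registered yet: dispatch kernels and register */
    kernel_bundle value;
    value.gemm1 = libxsmm_dmmdispatch(m, n, k,
      NULL, NULL, NULL, NULL, NULL, NULL, NULL);
    value.gemm2 = libxsmm_dmmdispatch(m, n, 2 * k,
      NULL, NULL, NULL, NULL, NULL, NULL, NULL);
    bundle = (kernel_bundle*)libxsmm_xregister(
      key, sizeof(key), sizeof(value), &value);
  }
  /* ... use bundle->gemm1 and bundle->gemm2 ... */
}
```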
### Targeted Compilation<a name="tuning"></a>
Specifying a code path is not necessary if the JIT backend is not disabled. However, disabling JIT compilation, statically generating a collection of kernels, and targeting a specific instruction set extension for the entire library looks like:
```bash
make JIT=0 AVX=3 MNK="1 2 3 4 5"
```
The above example builds a library which cannot be deployed to anything other than the <span>Intel Knights Landing processor family ("KNL")</span> or future <span>Intel Xeon</span> processors supporting foundational <span>Intel AVX‑512</span> instructions (<span>AVX‑512F</span>). The latter can be refined further by supplying MIC=1 (along with AVX=3); however, this hardly matters since the critical code is in inline assembly (and not affected). Similarly, SSE=0 (or JIT=0 without an SSE or AVX build flag) employs an "arch-native" approach, whereas AVX=1, AVX=2 (with FMA), and AVX=3 specifically select the kind of <span>Intel AVX</span> code. Moreover, controlling the target flags manually or adjusting the code optimizations is also possible. The following example is GCC-specific and corresponds to OPT=3, AVX=3, and MIC=1:
```bash
make OPT=3 TARGET="-mavx512f -mavx512cd -mavx512er -mavx512pf"
```
An extended interface can be generated which allows performing software prefetches. Prefetching data can be helpful when processing batches of matrix multiplications where the next operands are far away or otherwise unpredictable in their memory location. The prefetch strategy can be specified similarly to what is shown in the section [Generator Driver](libxsmm_be.md#generator-driver), i.e., by either using the number of the shown enumeration or by exactly using the name of the prefetch strategy. The only exception is PREFETCH=1, which automatically selects a strategy per an internal table (navigated by CPUID flags). The following example requests the "AL2jpst" strategy:
```bash
make PREFETCH=8
```
The prefetch interface extends the signature of all kernels by three arguments (pa, pb, and pc). These additional arguments specify the locations of the operands of the next multiplication (the next a, b, and c matrices). Supplying the (unnecessary) prefetch arguments to a regular three-argument kernel is not a big problem (beside of some additional call-overhead). However, calling a kernel that was generated with a prefetch strategy while supplying only three arguments makes the kernel pick up garbage data for the prefetch locations: the resulting software prefetches are misleading (or disable the hardware prefetcher), and an out-of-bounds (garbage) location may eventually cause a page fault.
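As a sketch (the dispatch call follows the C interface for double-precision; `LIBXSMM_PREFETCH_AUTO` is assumed to select the CPUID-navigated strategy mentioned above), a batch loop can pass the operands of the next multiplication:
```C
#include <libxsmm.h>

void batch(const double* a, const double* b, double* c,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
  int asize, int bsize, int csize, int batchsize)
{
  const int prefetch = LIBXSMM_PREFETCH_AUTO;
  /* dispatch a kernel with a prefetch signature (six arguments) */
  const libxsmm_dmmfunction xmm = libxsmm_dmmdispatch(m, n, k,
    NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/,
    NULL/*alpha*/, NULL/*beta*/, NULL/*flags*/, &prefetch);
  if (NULL != xmm) {
    int i;
    for (i = 0; i < batchsize - 1; ++i) { /* prefetch the next operands */
      xmm(a + i * asize, b + i * bsize, c + i * csize,
          a + (i + 1) * asize, b + (i + 1) * bsize, c + (i + 1) * csize);
    }
    /* last multiplication: any valid location serves as the "next" operands */
    i = batchsize - 1;
    xmm(a + i * asize, b + i * bsize, c + i * csize,
        a + i * asize, b + i * bsize, c + i * csize);
  }
}
```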
Further, a generated configuration ([template](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h)) of the library encodes the parameters for which the library was built (static information). This helps to optimize client code related to the library's functionality. For example, the LIBXSMM_MAX_\* and LIBXSMM_AVG_\* information can be used with the LIBXSMM_PRAGMA_LOOP_COUNT macro to hint loop trip counts when handling matrices related to the problem domain of LIBXSMM.
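For instance (a minimal sketch, assuming the macro takes minimum, maximum, and average trip counts in that order, and that it expands to nothing for compilers without such a pragma), a loop over an M-extent can be annotated as follows:
```C
#include <libxsmm.h>

void scale_columns(double* c, int m, int n) {
  int i, j;
  for (j = 0; j < n; ++j) {
    /* hint the compiler using the statically configured M-extents */
    LIBXSMM_PRAGMA_LOOP_COUNT(1, LIBXSMM_MAX_M, LIBXSMM_AVG_M)
    for (i = 0; i < m; ++i) {
      c[j * m + i] *= 0.5;
    }
  }
}
```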
### Auto-dispatch
The function `libxsmm_?mmdispatch` helps to amortize the cost of the dispatch when multiple calls with the same M, N, and K are needed. The automatic code dispatch orchestrates two levels:
1. Specialized routine (implemented in assembly code),
2. BLAS library call (fallback).
Both levels are accessible directly, which allows customizing the code dispatch. The fallback level may be supplied by the <span>Intel Math Kernel Library (Intel MKL) 11.2</span> DIRECT CALL feature.
Further, a preprocessor symbol denotes the largest problem-size (*M* x *N* x *K*) that belongs to the first level and thereby determines whether a matrix multiplication falls back to BLAS. The problem-size threshold can be configured by using, for example:
```bash
make THRESHOLD=$((60*60*60))
```
The maximum of the given threshold and the largest requested specialization refines the value of the threshold. Please note that explicitly JIT'ting and executing a kernel is possible and independent of the threshold. If a problem-size is below the threshold, dispatching the code requires figuring out whether a specialized routine exists or not.
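To sketch an application-specific dispatch built on these two levels (assuming `LIBXSMM_MAX_MNK` is the aforementioned threshold symbol of the generated configuration), a specialized kernel can be attempted first with the BLAS-based GEMM as the fallback:
```C
#include <libxsmm.h>

void gemm_or_blas(const double* a, const double* b, double* c,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k)
{
  libxsmm_dmmfunction xmm = NULL;
  if ((size_t)m * n * k <= LIBXSMM_MAX_MNK) { /* level 1: specialized routine */
    xmm = libxsmm_dmmdispatch(m, n, k, NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/,
      NULL/*alpha*/, NULL/*beta*/, NULL/*flags*/, NULL/*prefetch*/);
  }
  if (NULL != xmm) {
    xmm(a, b, c);
  }
  else { /* level 2: BLAS fallback (NULL arguments select the defaults) */
    libxsmm_blas_dgemm(NULL/*transa*/, NULL/*transb*/, &m, &n, &k,
      NULL/*alpha*/, a, NULL/*lda*/, b, NULL/*ldb*/,
      NULL/*beta*/, c, NULL/*ldc*/);
  }
}
```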
For statically generated code, the precision can be selected:
```bash
make PRECISION=2
```
The default preference is to generate and register both single- and double-precision code (PRECISION=0). Specifying <span>PRECISION=1|2</span> generates and registers single-precision or double-precision code, respectively.
The automatic dispatch is highly convenient because existing GEMM calls can be served by specialized kernels (even in a binary compatible fashion). However, there is (and always will be) an overhead associated with looking up the code-registry and checking whether the code determined by the GEMM call is already JIT'ted or not. This lookup has been optimized with various techniques: specialized CPU instructions to calculate CRC32 checksums, avoiding costly synchronization (needed for thread-safety) until it is ultimately known that the requested kernel is not yet JIT'ted, and a small thread-local cache of recently dispatched kernels. The latter can be adjusted in size (only power-of-two sizes) but also disabled:
```bash
make CACHE=0
```
Please note that the relative cost of automatically dispatching a requested kernel depends on the kernel size (obviously, smaller matrices are multiplied faster on an absolute basis); moreover, smaller matrix multiplications are bottlenecked by memory bandwidth rather than arithmetic intensity. The latter implies the highest relative dispatch overhead when (artificially) benchmarking the very same multiplication out of the CPU-cache.
To run basic [tests](http://libxsmm.readthedocs.io/#classic-library-abi):
```bash
make tests
```
Remember: a set of key-value pairs represents a single unique (re-)build (and test):
```bash
make STATIC=0 tests
```
There is a whole collection of test targets available (`test-cp2k`, `test-cpp`, `test-nek`). However, rather than invoking these individually, it is better to rely on the test suites.
## Test Suites
It is possible to run tests like LIBXSMM's continuous integration ([https://travis-ci.org/hfp/libxsmm](https://travis-ci.org/hfp/libxsmm)):
```bash
scripts/tool_test.sh
```
The above command runs the entire collection (equivalent to `scripts/tool_test.sh 0`). However, a single test (of currently 11 tests) can be selected by number (1-11):
```bash
scripts/tool_test.sh 1
```
The suite itself can also be selected. For example, some DNN tests are described in `.test-dnn.yml`:
```bash
TESTSET=test-dnn scripts/tool_test.sh
```
In general, all key-value pairs valid for LIBXSMM's `make` can be given as part of the environment:
```bash
AVX=3 MIC=0 TESTSET=test-dnn scripts/tool_test.sh
```
Please note that the suite/test itself may comprise key-value pairs that take precedence.
## CI Tests
The `tool_test.sh` script is included in repository archives and releases, i.e., it works for non-repository folders as well. In contrast, the Continuous Integration (CI) use case relies on the Git command being present and the folder being a Git-clone.
**Functionality**
* `[skip ci]` as part of a commit message will not trigger the CI agents, and tests are skipped for such a commit.
* `[full ci]` as part of a commit message will trigger a full test even if the setup uses the "Fast CI" option.
The "Fast CI" option is enabled by a filename given as the second command line argument:
```bash
scripts/tool_test.sh 1 .fullci
```
In the above example, a file named `.fullci` may contain path/file patterns (wildcard format) that trigger a full test if the files changed by the commit match any of the patterns.
## Portability
It is desirable to exercise the portability and reliability of LIBXSMM's source code even on non-Intel architectures by means of compilation, linkage, and generic tests. This section is *not* about Intel Architecture (or compatible). Successful compilation (or even running some of the tests successfully) does not mean LIBXSMM is valuable on such a platform.
Make sure to rely on `PLATFORM=1`; otherwise a compilation error occurs: _Intel Architecture or compatible CPU required!_ This error avoids (automated) attempts to upstream LIBXSMM to an unsupported platform. LIBXSMM is upstreamed for Intel Architecture on all major Linux distributions, FreeBSD, and others. If compilation fails with _LIBXSMM is only supported on a 64-bit platform!_, `make PLATFORM=1 DBG=1` can be used to exercise compilation.
If platform support is forced (`PLATFORM=1`), runtime code generation is disabled at compile-time (`JIT=0`). Runtime code generation can also be enabled (`PLATFORM=1 JIT=1`), but code-dispatch will still return NULL-kernels. However, some tests will start failing since missing JIT-support is not signaled at compile-time as it is with `JIT=0`.
**Note**: JIT-support normally guarantees a non-NULL code pointer ("kernel") if the request adheres to the [limitations](https://github.com/hfp/libxsmm/wiki/Q&A#what-is-a-small-matrix-multiplication) (user-code is not asked to check for a NULL-kernel), which does not hold true if JIT is enabled on a platform that does not implement it.
### TinyCC
The Tiny C Compiler (TinyCC) supports Intel Architecture, but it lacks support for thread-local storage (TLS), among other features.
```bash
make CC=tcc THREADS=0 INTRINSICS=0 VLA=0 ASNEEDED=0 BLAS=0 FORCE_CXX=0
```
### IBM XL Compiler for Linux (POWER)
The POWER platform requires the aforementioned `PLATFORM=1` to unlock compilation.
```bash
make PLATFORM=1 CC=xlc CXX=xlc++ FC=xlf
```
### Cross-compilation for ARM
ARM AArch64 is regularly [supported](https://github.com/hfp/libxsmm/wiki/Compatibility#arm-aarch64). However, 32-bit ARM requires the aforementioned `PLATFORM=1` to unlock compilation (similar to 32-bit Intel Architecture). Unlocking compilation for 32-bit ARM is not to be confused with supporting 32-bit ARM architectures.