删除子模块的gitignore

c454d419 · lisj · 3359c1f1 · 3359c1f1 · 3359c1f1 · 3359c1f1
Commit c454d419 authored May 12, 2023 by lisj
20 changed files
--- a/third_party/METIS/.gitignore
+++ b/third_party/METIS/.gitignore
-# Prerequisites
-*.d
-
-# Object files
-*.o
-*.ko
-*.obj
-*.elf
-
-# Linker output
-*.ilk
-*.map
-*.exp
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Libraries
-*.lib
-*.a
-*.la
-*.lo
-
-# Shared objects (inc. Windows DLLs)
-*.dll
-*.so
-*.so.*
-*.dylib
-
-# Executables
-*.exe
-*.out
-*.app
-*.i*86
-*.x86_64
-*.hex
-
-# Debug files
-*.dSYM/
-*.su
-*.idb
-*.pdb
-
-# Kernel Module Compile Results
-*.mod*
-*.cmd
-.tmp_versions/
-modules.order
-Module.symvers
-Mkfile.old
-dkms.conf
-
-# GK things
-build/
-graphs/*.part.*
-graphs/*.iperm
-graphs/*.epart.*
-graphs/*.npart.*
-.svn/
-
--- a/third_party/dlpack/.gitignore
+++ b/third_party/dlpack/.gitignore
-# Compiled Object files
-*.slo
-*.lo
-*.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-*.smod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
-*~
-build
-bin
--- a/third_party/dmlc-core/.gitignore
+++ b/third_party/dmlc-core/.gitignore
-# Compiled Object files
-*.slo
-*.lo
-*.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
-*~
-config.mk
-*.pyc
-
-# Vim
-*.swp
-*.swo
-*.swn
-*.csv
-.vimrc
-
-# Emacs
-.clang_complete
-deps
-recommonmark
-build
-
-# CLion
-.idea
-cmake-build-*
--- a/third_party/dmlc-core/make/config.mk
+++ b/third_party/dmlc-core/make/config.mk
+#-----------------------------------------------------
+#  dmlc-core: the configuration compile script
+#
+#  This is the default configuration setup for dmlc-core.
+#  If you want to change configuration, do the following steps:
+#
+#  - copy this file to the root of dmlc-core folder
+#  - modify the configuration you want
+#  - type make or make -j n on each of the folder
+#----------------------------------------------------
+
+# choice of compiler
+export CC = gcc
+export CXX = g++
+export MPICXX = mpicxx
+
+# choice of archiver
+export AR = ar
+
+# the additional link flags you want to add
+ADD_LDFLAGS =
+
+# the additional compile flags you want to add
+ADD_CFLAGS =
+
+# whether to compile with -fPIC option
+# Note: to build shared library(so files), fPIC is required
+WITH_FPIC = 1
+
+# whether use openmp during compile
+USE_OPENMP = 0
+
+# whether use HDFS support during compile
+USE_HDFS = 0
+
+# whether use AWS S3 support during compile
+USE_S3 = 0
+
+# whether use Azure blob support during compile
+USE_AZURE = 0
+
+# path to libjvm.so
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether building unittest (gtest is required)
+BUILD_TEST=0
+
+# path to gtest library (only used when $BUILD_TEST=1)
+# there should be an include path in $GTEST_PATH/include and library in $GTEST_PATH/lib
+GTEST_PATH=
+
+# path to third-party dependencies such as glog
+DEPS_PATH=
--- a/third_party/googletest/.gitignore
+++ b/third_party/googletest/.gitignore
-# Ignore CI build directory
-build/
-xcuserdata
-cmake-build-debug/
-.idea/
-bazel-bin
-bazel-genfiles
-bazel-googletest
-bazel-out
-bazel-testlogs
-# python
-*.pyc
-
-# Visual Studio files
-.vs
-*.sdf
-*.opensdf
-*.VC.opendb
-*.suo
-*.user
-_ReSharper.Caches/
-Win32-Debug/
-Win32-Release/
-x64-Debug/
-x64-Release/
-
-# Ignore autoconf / automake files
-Makefile.in
-aclocal.m4
-configure
-build-aux/
-autom4te.cache/
-googletest/m4/libtool.m4
-googletest/m4/ltoptions.m4
-googletest/m4/ltsugar.m4
-googletest/m4/ltversion.m4
-googletest/m4/lt~obsolete.m4
-googlemock/m4
-
-# Ignore generated directories.
-googlemock/fused-src/
-googletest/fused-src/
-
-# macOS files
-.DS_Store
-googletest/.DS_Store
-googletest/xcode/.DS_Store
-
-# Ignore cmake generated directories and files.
-CMakeFiles
-CTestTestfile.cmake
-Makefile
-cmake_install.cmake
-googlemock/CMakeFiles
-googlemock/CTestTestfile.cmake
-googlemock/Makefile
-googlemock/cmake_install.cmake
-googlemock/gtest
-/bin
-/googlemock/gmock.dir
-/googlemock/gmock_main.dir
-/googlemock/RUN_TESTS.vcxproj.filters
-/googlemock/RUN_TESTS.vcxproj
-/googlemock/INSTALL.vcxproj.filters
-/googlemock/INSTALL.vcxproj
-/googlemock/gmock_main.vcxproj.filters
-/googlemock/gmock_main.vcxproj
-/googlemock/gmock.vcxproj.filters
-/googlemock/gmock.vcxproj
-/googlemock/gmock.sln
-/googlemock/ALL_BUILD.vcxproj.filters
-/googlemock/ALL_BUILD.vcxproj
-/lib
-/Win32
-/ZERO_CHECK.vcxproj.filters
-/ZERO_CHECK.vcxproj
-/RUN_TESTS.vcxproj.filters
-/RUN_TESTS.vcxproj
-/INSTALL.vcxproj.filters
-/INSTALL.vcxproj
-/googletest-distribution.sln
-/CMakeCache.txt
-/ALL_BUILD.vcxproj.filters
-/ALL_BUILD.vcxproj
--- a/third_party/googletest/googlemock/build-aux/.keep
+++ b/third_party/googletest/googlemock/build-aux/.keep
--- a/third_party/googletest/googlemock/make/Makefile
+++ b/third_party/googletest/googlemock/make/Makefile
+# A sample Makefile for building both Google Mock and Google Test and
+# using them in user tests.  This file is self-contained, so you don't
+# need to use the Makefile in Google Test's source tree.  Please tweak
+# it to suit your environment and project.  You may want to move it to
+# your project's root directory.
+#
+# SYNOPSIS:
+#
+#   make [all]  - makes everything.
+#   make TARGET - makes the given target.
+#   make clean  - removes all files generated by make.
+
+# Please tweak the following variable definitions as needed by your
+# project, except GMOCK_HEADERS and GTEST_HEADERS, which you can use
+# in your own targets but shouldn't modify.
+
+# Points to the root of Google Test, relative to where this file is.
+# Remember to tweak this if you move this file, or if you want to use
+# a copy of Google Test at a different location.
+GTEST_DIR = ../../googletest
+
+# Points to the location of the Google Test libraries
+GTEST_LIB_DIR = .
+
+# Points to the root of Google Mock, relative to where this file is.
+# Remember to tweak this if you move this file.
+GMOCK_DIR = ..
+
+# Where to find user code.
+USER_DIR = ../test
+
+# Flags passed to the preprocessor.
+# Set Google Test and Google Mock's header directories as system
+# directories, such that the compiler doesn't generate warnings in
+# these headers.
+CPPFLAGS += -isystem $(GTEST_DIR)/include -isystem $(GMOCK_DIR)/include
+
+# Flags passed to the C++ compiler.
+CXXFLAGS += -g -Wall -Wextra -pthread -std=c++11
+
+# Google Test libraries
+GTEST_LIBS = libgtest.a libgtest_main.a libgmock.a libgmock_main.a
+
+# All tests produced by this Makefile.  Remember to add new tests you
+# created to the list.
+TESTS = gmock_test
+
+# All Google Test headers.  Usually you shouldn't change this
+# definition.
+GTEST_HEADERS = $(GTEST_DIR)/include/gtest/*.h \
+                $(GTEST_DIR)/include/gtest/internal/*.h
+
+# All Google Mock headers. Note that all Google Test headers are
+# included here too, as they are #included by Google Mock headers.
+# Usually you shouldn't change this definition.	
+GMOCK_HEADERS = $(GMOCK_DIR)/include/gmock/*.h \
+                $(GMOCK_DIR)/include/gmock/internal/*.h \
+                $(GTEST_HEADERS)
+
+# House-keeping build targets.
+
+all : $(GTEST_LIBS) $(TESTS)
+
+clean :
+	rm -f $(GTEST_LIBS) $(TESTS) *.o
+
+# Builds gmock.a and gmock_main.a.  These libraries contain both
+# Google Mock and Google Test.  A test should link with either gmock.a
+# or gmock_main.a, depending on whether it defines its own main()
+# function.  It's fine if your test only uses features from Google
+# Test (and not Google Mock).
+
+# Usually you shouldn't tweak such internal variables, indicated by a
+# trailing _.
+GTEST_SRCS_ = $(GTEST_DIR)/src/*.cc $(GTEST_DIR)/src/*.h $(GTEST_HEADERS)
+GMOCK_SRCS_ = $(GMOCK_DIR)/src/*.cc $(GMOCK_HEADERS)
+
+# For simplicity and to avoid depending on implementation details of
+# Google Mock and Google Test, the dependencies specified below are
+# conservative and not optimized.  This is fine as Google Mock and
+# Google Test compile fast and for ordinary users their source rarely
+# changes.
+gtest-all.o : $(GTEST_SRCS_)
+	$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \
+            -c $(GTEST_DIR)/src/gtest-all.cc
+
+gtest_main.o : $(GTEST_SRCS_)
+	$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \
+            -c $(GTEST_DIR)/src/gtest_main.cc
+
+gmock-all.o : $(GMOCK_SRCS_)
+	$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \
+            -c $(GMOCK_DIR)/src/gmock-all.cc
+
+gmock_main.o : $(GMOCK_SRCS_)
+	$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \
+            -c $(GMOCK_DIR)/src/gmock_main.cc
+
+libgtest.a : gtest-all.o
+	$(AR) $(ARFLAGS) $@ $^
+
+libgtest_main.a : gtest_main.o
+	$(AR) $(ARFLAGS) $@ $^
+
+libgmock.a : gmock-all.o
+	$(AR) $(ARFLAGS) $@ $^
+
+libgmock_main.a : gmock_main.o
+	$(AR) $(ARFLAGS) $@ $^
+
+# Builds a sample test.
+
+gmock_test.o : $(USER_DIR)/gmock_test.cc $(GMOCK_HEADERS)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(USER_DIR)/gmock_test.cc
+
+gmock_test : gmock_test.o $(GTEST_LIBS)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -L$(GTEST_LIB_DIR) -lgmock -lpthread $^ -o $@
--- a/third_party/googletest/googletest/make/Makefile
+++ b/third_party/googletest/googletest/make/Makefile
+# A sample Makefile for building Google Test and using it in user
+# tests.  Please tweak it to suit your environment and project.  You
+# may want to move it to your project's root directory.
+#
+# SYNOPSIS:
+#
+#   make [all]  - makes everything.
+#   make TARGET - makes the given target.
+#   make clean  - removes all files generated by make.
+
+# Please tweak the following variable definitions as needed by your
+# project, except GTEST_HEADERS, which you can use in your own targets
+# but shouldn't modify.
+
+# Points to the root of Google Test, relative to where this file is.
+# Remember to tweak this if you move this file.
+GTEST_DIR = ..
+
+# Points to the location of the Google Test libraries
+GTEST_LIB_DIR = .
+
+# Where to find user code.
+USER_DIR = ../samples
+
+# Flags passed to the preprocessor.
+# Set Google Test's header directory as a system directory, such that
+# the compiler doesn't generate warnings in Google Test headers.
+CPPFLAGS += -isystem $(GTEST_DIR)/include
+
+# Flags passed to the C++ compiler.
+CXXFLAGS += -g -Wall -Wextra -pthread -std=c++11
+
+# Google Test libraries
+GTEST_LIBS = libgtest.a libgtest_main.a
+
+# All tests produced by this Makefile.  Remember to add new tests you
+# created to the list.
+TESTS = sample1_unittest
+
+# All Google Test headers.  Usually you shouldn't change this
+# definition.
+GTEST_HEADERS = $(GTEST_DIR)/include/gtest/*.h \
+                $(GTEST_DIR)/include/gtest/internal/*.h
+
+# House-keeping build targets.
+
+all : $(GTEST_LIBS) $(TESTS)
+
+clean :
+	rm -f $(GTEST_LIBS) $(TESTS) *.o
+
+# Builds libgtest.a and libgtest_main.a.
+
+# Usually you shouldn't tweak such internal variables, indicated by a
+# trailing _.
+GTEST_SRCS_ = $(GTEST_DIR)/src/*.cc $(GTEST_DIR)/src/*.h $(GTEST_HEADERS)
+
+# For simplicity and to avoid depending on Google Test's
+# implementation details, the dependencies specified below are
+# conservative and not optimized.  This is fine as Google Test
+# compiles fast and for ordinary users its source rarely changes.
+gtest-all.o : $(GTEST_SRCS_)
+	$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -c \
+            $(GTEST_DIR)/src/gtest-all.cc
+
+gtest_main.o : $(GTEST_SRCS_)
+	$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -c \
+            $(GTEST_DIR)/src/gtest_main.cc
+
+libgtest.a : gtest-all.o
+	$(AR) $(ARFLAGS) $@ $^
+
+libgtest_main.a : gtest-all.o gtest_main.o
+	$(AR) $(ARFLAGS) $@ $^
+
+# Builds a sample test.  A test should link with either libgtest.a or
+# libgtest_main.a, depending on whether it defines its own main()
+# function.
+
+sample1.o : $(USER_DIR)/sample1.cc $(USER_DIR)/sample1.h $(GTEST_HEADERS)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(USER_DIR)/sample1.cc
+
+sample1_unittest.o : $(USER_DIR)/sample1_unittest.cc \
+                     $(USER_DIR)/sample1.h $(GTEST_HEADERS)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(USER_DIR)/sample1_unittest.cc
+
+sample1_unittest : sample1.o sample1_unittest.o $(GTEST_LIBS)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -L$(GTEST_LIB_DIR) -lgtest_main -lpthread $^ -o $@
--- a/third_party/googletest/googletest/scripts/test/Makefile
+++ b/third_party/googletest/googletest/scripts/test/Makefile
+# A Makefile for fusing Google Test and building a sample test against it.
+#
+# SYNOPSIS:
+#
+#   make [all]  - makes everything.
+#   make TARGET - makes the given target.
+#   make check  - makes everything and runs the built sample test.
+#   make clean  - removes all files generated by make.
+
+# Points to the root of fused Google Test, relative to where this file is.
+FUSED_GTEST_DIR = output
+
+# Paths to the fused gtest files.
+FUSED_GTEST_H = $(FUSED_GTEST_DIR)/gtest/gtest.h
+FUSED_GTEST_ALL_CC = $(FUSED_GTEST_DIR)/gtest/gtest-all.cc
+
+# Where to find the sample test.
+SAMPLE_DIR = ../../samples
+
+# Where to find gtest_main.cc.
+GTEST_MAIN_CC = ../../src/gtest_main.cc
+
+# Flags passed to the preprocessor.
+# We have no idea here whether pthreads is available in the system, so
+# disable its use.
+CPPFLAGS += -I$(FUSED_GTEST_DIR) -DGTEST_HAS_PTHREAD=0
+
+# Flags passed to the C++ compiler.
+CXXFLAGS += -g
+
+all : sample1_unittest
+
+check : all
+	./sample1_unittest
+
+clean :
+	rm -rf $(FUSED_GTEST_DIR) sample1_unittest *.o
+
+$(FUSED_GTEST_H) :
+	../fuse_gtest_files.py $(FUSED_GTEST_DIR)
+
+$(FUSED_GTEST_ALL_CC) :
+	../fuse_gtest_files.py $(FUSED_GTEST_DIR)
+
+gtest-all.o : $(FUSED_GTEST_H) $(FUSED_GTEST_ALL_CC)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(FUSED_GTEST_DIR)/gtest/gtest-all.cc
+
+gtest_main.o : $(FUSED_GTEST_H) $(GTEST_MAIN_CC)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(GTEST_MAIN_CC)
+
+sample1.o : $(SAMPLE_DIR)/sample1.cc $(SAMPLE_DIR)/sample1.h
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(SAMPLE_DIR)/sample1.cc
+
+sample1_unittest.o : $(SAMPLE_DIR)/sample1_unittest.cc \
+                     $(SAMPLE_DIR)/sample1.h $(FUSED_GTEST_H)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(SAMPLE_DIR)/sample1_unittest.cc
+
+sample1_unittest : sample1.o sample1_unittest.o gtest-all.o gtest_main.o
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ -o $@
--- a/third_party/libxsmm/.gitignore
+++ b/third_party/libxsmm/.gitignore
-My Amplifier*
-VTune Amplifier Results
-libxsmm*-*
-libxsmm*_*
-opentuner.db
-bin/libxsmm_generator
-include/libxsmm_version.h
-include/libxsmm.f
-lib/libxsmm*
-lib/module
-ide/GPUCache
-ide/_vs*-*.bat
-ide/.vs
-ide/obj
-ide/r*ah
-samples2
-samples/*/bin
-samples/*/*.sln
-samples/*/*.dat
-samples/*/*.pdf
-samples/*/*.png
-inspector*
-licenses
-bazel-*
-python*
-html
-site
-bin
-tmp
-obj
-.couscous
-.vscode
-.state
-.make
-.vs
-threadsafety-*.txt
-malloc-trace-*.txt
-blas-trace-*.txt
-codecov-*.txt
-keywords.txt
-notes.txt
-err*.txt
-out*.txt
-log.txt
-_*.txt
-.env.sh
-.env_??????
-.tool_??????.sh
-.libxsmm_??????.*
-*.lastcodeanalysissucceeded
-*.amplxeproj
-*.advixeproj
-*.inspxeproj
-*.stackdump
-*.opensdf
-*.opendb
-*.VC.db
-*.dylib
-*.sarif
-*.docx
-*.user
-*.tlog
-*.gcno
-*.gcda
-*.gcov
-*.html
-*.iobj
-*.ipdb
-*.URL
-*.log
-*.suo
-*.exe
-*.zip
-*.pyc
-*.sdf
-*.ilk
-*.pdb
-*.vsp
-*.obj
-*.lib
-*.mod
-*.bin
-*.jit
-*.smm
-*.soa
-*.csr
-*.dll
-*.mhd
-*.out
-*.err
-*.so
-*.o
-*.a
-*.i
-*.s
-*.*~
--- a/third_party/libxsmm/.state
+++ b/third_party/libxsmm/.state
+"ABSDIR=/public$HOME/dgl/third_party/libxsmm\n"
+"ABSLIBS=0\n"
+"ALPHA=1\n"
+"AR=/usr/bin/gcc-ar\n"
+"ASIMD=0\n"
+"ASNEEDED=0\n"
+"AUTOPIN=0\n"
+"BETA=1\n"
+"BLAS_CLDFLAGS=-lm\n"
+"CACHE=1\n"
+"CACHELINE=64\n"
+"CC=gcc\n"
+"CC_NAME=gcc\n"
+"CC_VERSION=8.5.0\n"
+"CFLAGS=-fPIC -Wall -O2 -fopenmp-simd -funroll-loops -ftree-vectorize -fdata-sections -ffunction-sections -fvisibility=hidden -pthread\n"
+"COMMAND=/usr/bin/command\n"
+"COMPATIBLE=0\n"
+"CPUFLAGS_X86=fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev sev_es\n"
+"CTARGET=-msse4.2\n"
+"CXXFLAGS=-fPIC -std=c++14 -Wall -O2 -fopenmp-simd -funroll-loops -ftree-vectorize -fdata-sections -ffunction-sections -fvisibility=hidden -fvisibility-inlines-hidden -pthread\n"
+"CXXLDFLAGS=-lc\n"
+"CXX_NAME=g++\n"
+"CXX_VERSION=8.5.0\n"
+"DBG=0\n"
+"FAT=0\n"
+"FCFLAGS=-fPIC -O2 -ftree-vectorize -fdata-sections -ffunction-sections\n"
+"FLD=gcc\n"
+"FLUSH=stdbuf -o0 -e0\n"
+"FORTRAN=0\n"
+"GCC_VERSION=8.5.0\n"
+"GLIBC=1\n"
+"ILP64=0\n"
+"INSTRUMENT=0\n"
+"INTRINSICS=1006\n"
+"IPO=0\n"
+"JITDUMP=0\n"
+"LD=gcc\n"
+"LDFLAGS=-Wl,--gc-sections -Wl,-z,relro,-z,now -lm -lrt -ldl\n"
+"LIBATOMIC=0\n"
+"LIBC=-lc\n"
+"LNKSOFT=1\n"
+"MAINTAINER=0\n"
+"MALLOC=0\n"
+"MIC=0\n"
+"MKL=0\n"
+"MNAME=x86_64\n"
+"OFFLOAD=0\n"
+"OMP=0\n"
+"OMPFLAG=-fopenmp\n"
+"OMPLIB=-L/usr/lib/gcc/x86_64-redhat-linux/8/ -lgomp\n"
+"OMPRT=gomp\n"
+"PERF=0\n"
+"PLATFORM=0\n"
+"PREFETCH=1\n"
+"SONAMELNK=2\n"
+"SPACES=0\n"
+"STATIC=1\n"
+"SYM=0\n"
+"THREADS=1\n"
+"THRESHOLD=0\n"
+"TRACE=0\n"
+"UNAME=Linux\n"
+"VISIBILITY=0\n"
+"WCHECK=0\n"
+"WERROR_CFLAG=-Werror\n"
+"WRAP=1\n"
+"XLD=g++\n"
+"\n"
--- a/third_party/libxsmm/.theme/main.html
+++ b/third_party/libxsmm/.theme/main.html
+{% extends "base.html" %}
+
+{% block site_meta %}
+  <meta name="google-site-verification" content="7G4Pmpl7BnEm6uQ_D8AqlhgPdu-H-MFo64tDR-A-n6c">
+  <meta name="google-site-verification" content="ehsbFL_ZVNx9dUSeI9tfsmVToAJGeUqNO4zrVyJ1NqU">
+  {{ super() }}
+{% endblock %}
+
+{% block footer %}
+  <hr>
+  {%- if config.copyright %}
+    <p>{{ config.copyright }}</p>
+  {%- endif %}
+{% endblock %}
--- a/third_party/libxsmm/bin/.make
+++ b/third_party/libxsmm/bin/.make
--- a/third_party/libxsmm/bin/libxsmm_gemm_generator
+++ b/third_party/libxsmm/bin/libxsmm_gemm_generator
--- a/third_party/libxsmm/documentation/libxsmm-dev.pptm
+++ b/third_party/libxsmm/documentation/libxsmm-dev.pptm
--- a/third_party/libxsmm/documentation/libxsmm_aux.md
+++ b/third_party/libxsmm/documentation/libxsmm_aux.md
+## Service Functions
+
+### Target Architecture<a name="getting-and-setting-the-target-architecture"></a>
+
+This functionality is available for the C and Fortran interface. There are [ID based](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_cpuid.h#L47) (same for C and Fortran) and string based functions to query the code path (as determined by the CPUID), or to set the code path regardless of the presented CPUID features. The latter may degrade performance if a lower set of instruction set extensions is requested, which can be still useful for studying the performance impact of different instruction set extensions.  
+**Note**: There is no additional check performed if an unsupported instruction set extension is requested, and incompatible JIT-generated code may be executed (unknown instruction signaled).
+
+```C
+int libxsmm_get_target_archid(void);
+void libxsmm_set_target_archid(int id);
+
+const char* libxsmm_get_target_arch(void);
+void libxsmm_set_target_arch(const char* arch);
+```
+
+Available code paths (IDs and corresponding strings):
+
+* LIBXSMM_TARGET_ARCH_GENERIC: "**generic**", "none", "0"
+* LIBXSMM_X86_GENERIC: "**x86**", "x64", "sse2"
+* LIBXSMM_X86_SSE3: "**sse3**"
+* LIBXSMM_X86_SSE42: "**wsm**", "nhm", "sse4", "sse4_2", "sse4.2"
+* LIBXSMM_X86_AVX: "**snb**", "avx"
+* LIBXSMM_X86_AVX2: "**hsw**", "avx2"
+* LIBXSMM_X86_AVX512_MIC: "**knl**", "mic"
+* LIBXSMM_X86_AVX512_KNM: "**knm**"
+* LIBXSMM_X86_AVX512_CORE: "**skx**", "skl", "avx3", "avx512"
+* LIBXSMM_X86_AVX512_CLX: "**clx**"
+* LIBXSMM_X86_AVX512_CPX: "**cpx**"
+* LIBXSMM_X86_AVX512_SPR: "**spr**"
+
+The **bold** names are returned by `libxsmm_get_target_arch` whereas `libxsmm_set_target_arch` accepts all of the above strings (similar to the environment variable LIBXSMM_TARGET).
+
+### Verbosity Level<a name="getting-and-setting-the-verbosity"></a>
+
+The [verbose mode](index.md#verbose-mode) (level of verbosity) can be controlled using the C or Fortran API, and there is an environment variable which corresponds to `libxsmm_set_verbosity` (LIBXSMM_VERBOSE).
+
+```C
+int libxsmm_get_verbosity(void);
+void libxsmm_set_verbosity(int level);
+```
+
+### Timer Facility
+
+Due to the performance oriented nature of LIBXSMM, timer-related functionality is available for the C and Fortran interface ([libxsmm_timer.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_timer.h#L37) and [libxsmm.f](https://github.com/hfp/libxsmm/blob/master/include/libxsmm.f#L32)). The timer is used in many of the [code samples](https://github.com/hfp/libxsmm/tree/master/samples) to measure the duration of executing a region of the code. The timer is based on a monotonic clock tick, which uses a platform-specific resolution. The counter may rely on the time stamp counter instruction (RDTSC), which is not necessarily counting CPU cycles (reasons are out of scope in this context). However, `libxsmm_timer_ncycles` delivers raw clock ticks (RDTSC).
+
+```C
+typedef unsigned long long libxsmm_timer_tickint;
+libxsmm_timer_tickint libxsmm_timer_tick(void);
+double libxsmm_timer_duration(
+  libxsmm_timer_tickint tick0,
+  libxsmm_timer_tickint tick1);
+libxsmm_timer_tickint libxsmm_timer_ncycles(
+  libxsmm_timer_tickint tick0,
+  libxsmm_timer_tickint tick1);
+```
+
+### User-Data Dispatch
+
+To register a user-defined key-value pair with LIBXSMM's fast key-value store, the key must be binary reproducible. Structured key-data (`struct` or `class` type which can be padded in a compiler-specific fashion) must be completely cleared, i.e., all gaps may be zero-filled before initializing data members (`memset(&mykey, 0, sizeof(mykey))`). This is because some compilers can leave padded data uninitialized, which breaks binary reproducible keys, hence the flow is: clearing heterogeneous keys (struct), initialization (members), and registration. The size of the key is arbitrary but limited to LIBXSMM_DESCRIPTOR_MAXSIZE (96 Byte), and the value can be of arbitrary size. The given value is copied and may be initialized at registration-time or when dispatched. Registered data is released at program termination but can be manually unregistered and released (`libxsmm_xrelease`), e.g., to register a larger value for an existing key.
+
+```C
+void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init);
+void* libxsmm_xdispatch(const void* key, size_t key_size);
+```
+
+The Fortran interface is designed to follow the same flow as the <span>C&#160;language</span>: <span>(1)&#160;</span>`libxsmm_xdispatch` is used to query the value, and <span>(2)&#160;if</span> the value is a NULL-pointer, it is registered per `libxsmm_xregister`. Similar to C (`memset`), structured key-data must be zero-filled (`libxsmm_xclear`) even when followed by an element-wise initialization. A key based on a contiguous array has no gaps by definition and it is enough to initialize the array elements. A [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch_udt.f) is given as part of the [Dispatch Microbenchmark](https://github.com/hfp/libxsmm/tree/master/samples/utilities/dispatch).
+
+```Fortran
+FUNCTION libxsmm_xregister(key, keysize, valsize, valinit)
+  TYPE(C_PTR), INTENT(IN), VALUE :: key
+  TYPE(C_PTR), INTENT(IN), VALUE, OPTIONAL :: valinit
+  INTEGER(C_INT), INTENT(IN) :: keysize, valsize
+  TYPE(C_PTR) :: libxsmm_xregister
+END FUNCTION
+
+FUNCTION libxsmm_xdispatch(key, keysize)
+  TYPE(C_PTR), INTENT(IN), VALUE :: key
+  INTEGER(C_INT), INTENT(IN) :: keysize
+  TYPE(C_PTR) :: libxsmm_xdispatch
+END FUNCTION
+```
+
+**Note**: This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. However, the functionality is not limited to multiple kernels but any data can be registered and queried. User-data dispatch uses the same implementation as regular code-dispatch.
+
+### Memory Allocation
+
+The C interface ([libxsmm_malloc.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_malloc.h)) provides functions for aligned memory allocation, one of which allows specifying the alignment (or requesting an automatically selected alignment). The automatic alignment is also available with a `malloc` compatible signature. The size of the automatic alignment depends on a heuristic, which uses the size of the requested buffer.  
+**Note**: The function `libxsmm_free` must be used to deallocate buffers allocated by LIBXSMM's allocation functions.
+
+```C
+void* libxsmm_malloc(size_t size);
+void* libxsmm_aligned_malloc(size_t size, size_t alignment);
+void* libxsmm_aligned_scratch(size_t size, size_t alignment);
+void libxsmm_free(const volatile void* memory);
+int libxsmm_get_malloc_info(const void* m, libxsmm_malloc_info* i);
+int libxsmm_get_scratch_info(libxsmm_scratch_info* info);
+```
+
+The library exposes two memory allocation domains: <span>(1)&#160;default</span> memory allocation, and <span>(2)&#160;scratch</span> memory allocation. There are similar service functions for both domains that allow to customize the allocation and deallocation function. The "context form" even supports a user-defined "object", which may represent an allocator or any other external facility. To set the allocator of the default domain is analogous to setting the allocator of the scratch memory domain (shown below).
+
+```C
+int libxsmm_set_scratch_allocator(void* context,
+  libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn);
+int libxsmm_get_scratch_allocator(void** context,
+  libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn);
+```
+
+The scratch memory allocation is very effective and delivers a decent speedup over subsequent regular memory allocations. In contrast to the default allocator, a watermark for repeatedly allocated and deallocated buffers is established. The scratch memory domain is (arbitrarily) limited to <span>4&#160;GB</span> of memory which can be adjusted to a different number of Bytes (available per [libxsmm_malloc.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_malloc.h), and also per environment variable LIBXSMM_SCRATCH_LIMIT with optional "k|K", "m|M", "g|G" units, unlimited per "-1").
+
+```C
+void libxsmm_set_scratch_limit(size_t nbytes);
+size_t libxsmm_get_scratch_limit(void);
+```
+
+By establishing a pool of "temporary" memory, the cost of repeated allocation and deallocation cycles is avoided when the watermark is reached. The scratch memory is scope-oriented with a limited number of pools for buffers of different life-time or held for different threads. The [verbose mode](index.md#verbose-mode) with a verbosity level of at least two (LIBXSMM_VERBOSE=2) shows some statistics about the populated scratch memory.
+
+```bash
+Scratch: 173 MB (mallocs=5, pools=1)
+```
+
+To improve thread-scalability and to avoid frequent memory allocation/deallocation, the scratch memory allocator can be leveraged by [intercepting existing malloc/free calls](libxsmm_tune.md#intercepted-allocations).
+
+**Note**: be careful with scratch memory as it only grows during execution (in between `libxsmm_init` and `libxsmm_finalize` unless `libxsmm_release_scratch` is called). This is true even when `libxsmm_free` is (and should be) used!
+
+### Meta Image File I/O
+
+Loading and storing data (I/O) is normally out of LIBXSMM's scope. However, comparing results (correctness) or writing files for visual inspection is clearly desired. This is particularly useful for the DNN domain. The MHD library domain provides support for the Meta Image File format (MHD). Tools such as [ITK-SNAP](http://itksnap.org/) or [ParaView](https://www.paraview.org/) can be used to inspect, compare, and modify images (even beyond two-dimensional images).
+
+Writing an image is done per `libxsmm_mhd_write`, and loading an image is split into two stages: <span>(1)&#160;</span>`libxsmm_mhd_read_header`, and <span>(2)&#160;</span>`libxsmm_mhd_read`. The first step makes it possible to allocate a properly sized buffer, which is then used to obtain the data per `libxsmm_mhd_read`. When reading data, an on-the-fly type conversion is supported. Further, data that is already in memory can be compared against file-data without allocating memory or reading this file into memory.
+
+To load an image from a familiar format (JPG, PNG, etc.), one may save the raw data using for instance [IrfanView](http://www.irfanview.com/) and rely on a "header-only" MHD-file (plain text). This may look like:
+
+```ini
+NDims = 2
+DimSize = 202 134
+ElementType = MET_UCHAR
+ElementNumberOfChannels = 1
+ElementDataFile = mhd_image.raw
+```
+
+In the above case, a single channel (gray-scale) 202x134-image is described with pixel data stored separately (`mhd_image.raw`). Multi-channel images are expected to interleave the pixel data. The pixel type is per `libxsmm_mhd_elemtype` ([libxsmm_mhd.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_mhd.h#L38)).
+
+### Thread Synchronization
+
+LIBXSMM comes with a number of light-weight abstraction layers (macro and API-based), which are distinct from the internal API (include files in [src](https://github.com/hfp/libxsmm/tree/master/src) directory) and that are exposed for general use (and hence part of the [include](https://github.com/hfp/libxsmm/tree/master/include) directory).
+
+The synchronization layer is mainly based on macros: LIBXSMM_LOCK_\* provide spin-locks, mutexes, and reader-writer locks (LIBXSMM_LOCK_SPINLOCK, LIBXSMM_LOCK_MUTEX, and LIBXSMM_LOCK_RWLOCK respectively). Usually the spin-lock is also named LIBXSMM_LOCK_DEFAULT. The implementation is intentionally based on OS-native primitives unless LIBXSMM is reconfigured (per LIBXSMM_LOCK_SYSTEM) or built using `make OMP=1` (using OpenMP inside of the library is not recommended). The life-cycle of a lock looks like:
+
+```C
+/* attribute variable and lock variable */
+LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK_DEFAULT) attr;
+LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_DEFAULT) lock;
+/* attribute initialization */
+LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK_DEFAULT, &attr);
+/* lock initialization per initialized attribute */
+LIBXSMM_LOCK_INIT(LIBXSMM_LOCK_DEFAULT, &lock, &attr);
+/* the attribute can be destroyed */
+LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK_DEFAULT, &attr);
+/* lock destruction (usage: see below/next code block) */
+LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK_DEFAULT, &lock);
+```
+
+Once the lock is initialized (or an array of locks), it can be exclusively locked or try-locked, and released at the end of the locked section (LIBXSMM_LOCK_ACQUIRE, LIBXSMM_LOCK_TRYLOCK, and LIBXSMM_LOCK_RELEASE respectively):
+
+```C
+LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK_DEFAULT, &lock);
+/* locked code section */
+LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &lock);
+```
+
+If the lock-kind is LIBXSMM_LOCK_RWLOCK, non-exclusive (a.k.a. shared) locking permits multiple readers (LIBXSMM_LOCK_ACQREAD, LIBXSMM_LOCK_TRYREAD, and LIBXSMM_LOCK_RELREAD) if the lock is not acquired exclusively (see above). An attempt to read-lock anything other than an RW-lock results in an exclusive lock (see above).
+
+```C
+if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) ==
+    LIBXSMM_LOCK_TRYREAD(LIBXSMM_LOCK_RWLOCK, &rwlock))
+{ /* locked code section */
+  LIBXSMM_LOCK_RELREAD(LIBXSMM_LOCK_RWLOCK, &rwlock);
+}
+```
+
+Locking different sections for read (LIBXSMM_LOCK_ACQREAD, LIBXSMM_LOCK_RELREAD) and write (LIBXSMM_LOCK_ACQUIRE, LIBXSMM_LOCK_RELEASE) may look like:
+
+```C
+LIBXSMM_LOCK_ACQREAD(LIBXSMM_LOCK_RWLOCK, &rwlock);
+/* locked code section: only reads are performed */
+LIBXSMM_LOCK_RELREAD(LIBXSMM_LOCK_RWLOCK, &rwlock);
+
+LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK_RWLOCK, &rwlock);
+/* locked code section: exclusive write (no R/W) */
+LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_RWLOCK, &rwlock);
+```
+
+For a lock not backed by an OS level primitive (fully featured lock), the synchronization layer also provides a simple lock based on atomic operations:
+
+```C
+static union { char pad[LIBXSMM_CACHELINE]; volatile LIBXSMM_ATOMIC_LOCKTYPE state; } lock;
+LIBXSMM_ATOMIC_ACQUIRE(&lock.state, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED);
+/* locked code section */
+LIBXSMM_ATOMIC_RELEASE(&lock.state, LIBXSMM_ATOMIC_RELAXED);
+```
+
+In addition to the LIBXSMM_LOCK_\* macros or LIBXSMM_ATOMIC_LOCKTYPE, API-based lock primitives are also available (libxsmm_mutex_\*, and libxsmm_rwlock_\*). However, the underlying implementation of the latter is experimental.
+
--- a/third_party/libxsmm/documentation/libxsmm_be.md
+++ b/third_party/libxsmm/documentation/libxsmm_be.md
+## Backend
+
+### Code Generator (JIT)
+
+There can be situations in which it is up-front not clear which problem-sizes will be needed when running an application. To leverage LIBXSMM's high-performance kernels, the library implements a JIT (Just-In-Time) code generation backend which generates the requested kernels on the fly (in-memory). This is accomplished by emitting the corresponding byte-code directly into an executable buffer. The actual JIT code is generated per the CPUID flags, and therefore does not rely on the code path selected when building the library. In the current implementation, some limitations apply to the JIT backend specifically:
+
+1. To stay agnostic to any threading model used, Pthread mutexes are guarding the updates of the JIT'ted code cache (link line with `-lpthread` is required); building with OMP=1 employs an OpenMP critical section as an alternative locking mechanism.
+2. There is limited support for the Windows calling convention (only kernels without prefetch signature).
+
+The JIT backend can also be disabled at build time (`make JIT=0`) as well as at runtime (`LIBXSMM_TARGET=0`, or anything prior to <span>Intel&#160;AVX</span>). The latter is an environment variable which allows to set a code path independent of the CPUID (<span>LIBXSMM_TARGET=0&#124;1&#124;sse&#124;snb&#124;hsw&#124;knl&#124;knm&#124;skx&#124;clx&#124;cpx&#124;spr</span>). Please note that LIBXSMM_TARGET cannot enable the JIT backend if it was disabled at build time (JIT=0).
+
+One can use the aforementioned THRESHOLD parameter to control the matrix sizes for which the JIT compilation will be automatically performed. However, explicitly requested kernels (by calling `libxsmm_?mmdispatch`) are not subject to a threshold for the problem-size. In any case, JIT code generation can be used to accompany statically generated code.
+
+### Generator Driver
+
+In rare situations, it might be useful to directly incorporate generated C code (with inline assembly regions). This is accomplished by invoking a driver program (with certain command line arguments).
+
+**Note**: The stand-alone generator-driver is considered legacy (deprecated). Associated functionality may be removed and future instruction set extensions may not be addressed with printed assembly code. The cost of dispatching JIT-code for every code region of an application, and for every visit of such region, can be amortized in several ways and without dispensing with JIT-generated code. Dispatching [multiple kernels at once](libxsmm_aux.md#user-data-dispatch) or (most effectively) tabulating JIT'ted function pointers manually, can alleviate or remove first-time code generation and (more importantly) the cost of subsequently dispatching kernels (when code was already JIT-generated).
+
+The generator driver program is usually built as part of LIBXSMM's build process, but also available as a separate build target:
+
+```bash
+make generator
+bin/libxsmm_gemm_generator
+```
+
+The code generator driver program accepts the following arguments:
+
+1. Select: dense, dense_asm, sparse, sparse_csr, or sparse_csr_reg
+2. Filename of a file to append to
+3. Routine name to be created
+4. M parameter
+5. N parameter
+6. K parameter
+7. LDA (0 indicates A is sparse if 1st arg. is "sparse*")
+8. LDB (0 indicates B is sparse if 1st arg. is "sparse*")
+9. LDC parameter
+10. Alpha (1)
+11. Beta: (0 or 1)
+12. Alignment override for A (1 auto, 0 unalignment)
+13. Alignment override for C (1 auto, 0 unalignment)
+14. Architecture (noarch, wsm, snb, hsw, knl, knm, skx, clx, cpx)
+15. Prefetch strategy, see below (only nopf or pfsigonly for "sparse*")
+16. SP (single-precision), DP (double-precision), or I16 (only "dense*")
+17. CSC file in Matrix market format (only if 1st arg. is "sparse*").
+
+<a name="prefetch-strategy"></a>The prefetch strategy can be:
+
+1. "nopf": data is not prefetched, just three arguments: A, B, and C
+2. "pfsigonly": no prefetches, kernel signature: A, B, C, A', B', and C'
+3. "BL2viaC": uses accesses to C to prefetch B'
+4. "AL2": uses accesses to A to prefetch A
+5. "curAL2": prefetches current A ahead in the kernel
+6. "AL2_BL2viaC": combines AL2 and BL2viaC
+7. "curAL2_BL2viaC": combines curAL2 and BL2viaC
+
+Here are some examples of invoking the driver program:
+
+```bash
+bin/libxsmm_gemm_generator dense foo.c foo 16 16 16 32 32 32 1 1 1 1 hsw nopf DP
+bin/libxsmm_gemm_generator dense_asm foo.c foo 16 16 16 32 32 32 1 1 1 1 knl AL2_BL2viaC DP
+bin/libxsmm_gemm_generator sparse foo.c foo 16 16 16 32 0 32 1 1 1 1 hsw nopf DP bar.csc
+```
+
+Please note, there are additional examples given in samples/generator and samples/seissol.
+
+### Development Concepts
+
+The low-level code generator is hosted by a single translation unit ([src/generator_x86_instructions.c](https://github.com/hfp/libxsmm/blob/master/src/generator_x86_instructions.c)). The code generator emits instructions as enumerated in [src/generator_common.h](https://github.com/hfp/libxsmm/blob/master/src/generator_common.h). A kernel then is a buffered stream of instructions in either binary/encoded or textual form. The latter is leveraged by stand-alone generator drivers that can print <span>C&#160;functions</span> with an assembly section (inline). A [generator driver](#generator-driver) may exist for some of LIBXSMM's function domains. Please note that emitting the textual form is not needed to inspect the emitted code since the binary encoded form can be easily disassembled ([objdump](index.md#objdump)).
+
+The binary encoded form is directly suitable for execution by casting the code-buffer into a function-pointer of the corresponding signature. It is advised to rely on LIBXSMM's internal memory allocation routines to acquire an executable buffer (see libxsmm_malloc_flags, libxsmm_xmalloc, and libxsmm_malloc_attrib in [src/libxsmm_main.h](https://github.com/hfp/libxsmm/blob/master/src/libxsmm_main.h)). This ensures correct behavior in security-hardened environments. As a bonus, [profiler support](libxsmm_prof.md) for the emitted code is enabled transparently.
+
+To debug the JIT'ted code, GNU GDB can be used to disassemble a given memory address (`disas address,+length`). Having the code disassembled side-by-side (while debugging) helps to look ahead and to have some orientation. For the latter, [objdump](index.md#objdump) can be used to acquire the source code (assembly) along with hexadecimal line numbers (length). The offset position (for GDB's disas) directly corresponds to objdump's line numbers.
+
+The kernel development is much like assembly programming, except that an API is used to emit instructions. For further reference, some existing source code for building kernels can be inspected (e.g., matcopy). This may help to capture the concept of mapping registers (basically a table to avoid hard-coding register names).
+
--- a/third_party/libxsmm/documentation/libxsmm_compat.md
+++ b/third_party/libxsmm/documentation/libxsmm_compat.md
+## Linux
+
+All Linux distributions are meant to be fully supported (please [report](https://github.com/hfp/libxsmm/issues/new) any compatibility issue). A shared library (`STATIC=0`) necessarily implies some performance hit when accessing thread-local memory (contended multicore execution). The GNU Compiler Collection prior to v5.1 may imply performance hits in some CPUID-dispatched code paths (non-JIT).
+
+> In case of outdated Binutils, compilation can fail to assemble code that originates from code sections using Intrinsics (see issue [#170](https://github.com/hfp/libxsmm/issues/170) and [#212](https://github.com/hfp/libxsmm/issues/212#issuecomment-394620082)). To resolve the problem, please use `INTRINSICS=1` along with the desired target e.g., `AVX=3 MIC=0`, or `AVX=2`.
+
+## CRAY
+
+In addition to the regular Linux support, the CRAY Compiling Environment (CCE) is supported: Intel Compiler as well as the GNU Compiler Collection are detected even when invoked per CCE, and the CRAY compiler is likely configured to build for the architecture of the compute nodes and hence the compiler is sufficiently treated without specific build flags (`COMPATIBLE=1` is implicitly set). The CCE may suppress building a shared library (`STATIC=0`), which also affects the TRACE facility (requires dynamic linkage even for static archives).
+
+```bash
+make CXX=CC CC=cc FC=ftn
+```
+
+The compatibility settings imply minor issues when using the CRAY compiler: full control and [customization](http://libxsmm.readthedocs.io/libxsmm_tune/) is not implemented, enabling symbols (`SYM=1`) appears to imply an unoptimized debug-build (due to the `-g` flag being present). Some sample codes/benchmarks enable symbols but are meant to not enable debug-code. The LIBXSMM library however is built without symbols by default.
+
+## Windows
+
+### Microsoft Windows
+
+Microsoft Windows is [supported](https://github.com/hfp/libxsmm/wiki/Q&A#what-operating-systems-are-covered-by-libxsmm-and-what-about-microsoft-windows) using the Microsoft Visual Studio environment (no `make`). It is advised to review the build settings. However, the following configurations are available: `debug`, `release`, and release mode with `symbols`. JIT-code generation is enabled but limited to the MM domain (GEMM kernels and matcopy kernels; no transpose kernels). GEMM kernels with prefetch signature remain as non-prefetch kernels i.e., prefetch locations are ignored due to the effort of fully supporting the Windows calling convention. As a workaround and to properly preserve caller-state, each JIT-kernel call may be wrapped in its own function.
+
+### Cygwin
+
+Cygwin (non-MinGW) is fully supported. Please note, that all limitations of Microsoft Windows apply.
+
+```bash
+make
+```
+
+LIBXSMM can be built as a static library as well as a dynamic link library (STATIC=0).
+
+### MinGW/Cygwin
+
+This is about the Cygwin-hosted bits of MinGW. The `-fno-asynchronous-unwind-tables` compiler flag is automatically applied. Please note, that all limitations of Microsoft Windows apply.
+
+```bash
+make \
+  CXX=x86_64-w64-mingw32-g++ \
+  CC=x86_64-w64-mingw32-gcc \
+  FC=x86_64-w64-mingw32-gfortran
+```
+
+To run tests, `BLAS=0` may be supplied (since Cygwin does not seem to provide BLAS-bits for the MinGW part). However, this may be different for "native" MinGW, or can be fixed by supplying a BLAS library somehow else.
+
+### MinGW
+
+This is about the "native" MinGW environment. Please note, there is the original [MinGW](https://mingw.osdn.io/) as well as a [fork](http://mingw-w64.org/) (made in 2007). Both of which can target Windows 64-bit. Here, the [MSYS2 installer](https://www.msys2.org/) (scroll down on that page to see the full installation instructions) has been used (see the [details](https://github.com/msys2/msys2/wiki/MSYS2-installation) on how to install missing packages).
+
+```bash
+pacman -S msys/make msys/python msys/diffutils \
+  mingw64/mingw-w64-x86_64-gcc mingw64/mingw-w64-x86_64-gcc-fortran \
+  mingw64/mingw-w64-x86_64-openblas
+```
+
+Similar to Cygwin/MinGW, the `-fno-asynchronous-unwind-tables` flag is automatically applied.
+
+```bash
+make
+```
+
+LIBXSMM can be built as a static library as well as a dynamic link library (`STATIC=0`).
+
+## Apple macOS
+
+LIBXSMM for macOS (OSX) is fully supported (i.e., it qualifies a release). The default is to rely on Apple's Clang based (platform-)compiler ("gcc"). However, the actual GCC as well as the Intel Compiler for macOS can be used.
+
+## FreeBSD
+
+LIBXSMM is occasionally tested under FreeBSD. For libxsmmext, it is necessary to install OpenMP (`sudo pkg install openmp`).
+
+```bash
+bash
+gmake
+```
+An attempt to run the [tests](https://github.com/hfp/libxsmm/wiki/Validation) may ask for a LAPACK/BLAS installation (unless `BLAS=0` is given). Both Netlib BLAS (reference) and OpenBLAS are available (in case of linker error due to the GNU Fortran runtime library, one can try `gmake CXX=g++7 CC=gcc7 FC=gfortran7` i.e., select a consistent tool chain and adjust `LD_LIBRARY_PATH` accordingly e.g., `/usr/local/lib/gcc7`).
+
+## PGI Compiler
+
+The PGI Compiler&#160;2019 (and later) is supported. Earlier versions were only occasionally tested and automatically enabled the `COMPATIBLE=1` and `INTRINSICS=0` settings. Still, atomic builtins seem incomplete (at least with `pgcc`) hence LIBXSMM built with PGI Compiler is not fully thread-safe (tests/threadsafety can fail). Support for GNU's libatomic has been incorporated mainly for PGI but is also missing built-in compiler support hence supposedly atomic operations are mapped to normal (non-atomic) code sequences (`LIBXSMM_SYNC_SYSTEM`).
+
+```bash
+make CXX=pgc++ CC=pgcc FC=pgfortran
+```
+
+### ARM AArch64
+
+This section is not strictly about compiler compatibility but rather about AArch64 (v8.1) being supported, which practically covers the baseline ARM 64-bit architecture from embedded and mobile to supercomputers. The build and installation process of LIBXSMM is the same as for Intel Architecture (IA) and the library can be natively compiled or cross-compiled. The latter for instance looks like:
+
+```bash
+make PLATFORM=1 AR=aarch64-linux-gnu-ar \
+  FC=aarch64-linux-gnu-gfortran \
+  CXX=aarch64-linux-gnu-g++ \
+  CC=aarch64-linux-gnu-gcc
+```
+
+**Note**: Apple&#160;M1 is supported but JIT code generation may fail due to macOS&#160;11 ("Big Sur"). LIBXSMM does not currently support macOS&#160;11.x (regardless of ARM or Intel Architecture).
--- a/third_party/libxsmm/documentation/libxsmm_dl.md
+++ b/third_party/libxsmm/documentation/libxsmm_dl.md
+## Deep Neural Networks
+
+To achieve best performance with small convolutions for CNN on SIMD architectures, a specific data layout must be used. As this layout depends on several architectural parameters, the goal of LIBXSMM's interface is to hide this complexity from the user by providing copy-in and copy-out routines. This happens using opaque data types, which themselves are later bound to a convolution operation.
+
+The interface is available for C. There is a collection of code samples ([samples/deeplearning](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning)) available including a light-weight [framework for deep learning (GXM)](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/gxm), and samples with focus on [Convolutional Deep Neural Networks (DNNs)](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/cnnlayer), or [LSTM cells](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/lstmdriver), etc. The general concept of the CNN interface is centered around a few types: `libxsmm_dnn_layer`, `libxsmm_dnn_buffer`, `libxsmm_dnn_bias`, and `libxsmm_dnn_filter`. A handle of such a type is always set up by calling a create-function.
+
+```C
+/** Simplified LIBXSMM types which are needed to create a handle. */
+
+/** Structure which describes the input and output of data (DNN). */
+typedef struct libxsmm_dnn_conv_desc {
+  int N;                                    /* number of images in mini-batch */
+  int C;                                    /* number of input feature maps */
+  int H;                                    /* height of input image */
+  int W;                                    /* width of input image */
+  int K;                                    /* number of output feature maps */
+  int R;                                    /* height of filter kernel */
+  int S;                                    /* width of filter kernel */
+  int u;                                    /* vertical stride */
+  int v;                                    /* horizontal stride */
+  int pad_h;                                /* height of logical rim padding to input
+                                               for adjusting output height */
+  int pad_w;                                /* width of logical rim padding to input
+                                               for adjusting output width */
+  int pad_h_in;                             /* height of zero-padding in input buffer,
+                                               must equal to pad_h for direct conv */
+  int pad_w_in;                             /* width of zero-padding in input buffer,
+                                               must equal to pad_w for direct conv */
+  int pad_h_out;                            /* height of zero-padding in output buffer */
+  int pad_w_out;                            /* width of zero-padding in output buffer */
+  int threads;                              /* number of threads to use when running
+                                               convolution */
+  libxsmm_dnn_datatype datatype;            /* datatype used for all inputs and outputs */
+  libxsmm_dnn_tensor_format buffer_format;  /* format which is used for activation buffers */
+  libxsmm_dnn_tensor_format filter_format;  /* format which is for filter buffers */
+  libxsmm_dnn_conv_algo algo;               /* convolution algorithm used */
+  libxsmm_dnn_conv_option options;          /* additional options */
+  libxsmm_dnn_conv_fuse_op fuse_ops;        /* ops fused into convolutions */
+} libxsmm_dnn_conv_desc;
+
+/** Type of algorithm used for convolutions. */
+typedef enum libxsmm_dnn_conv_algo {
+  /** let the library decide */
+  LIBXSMM_DNN_CONV_ALGO_AUTO,   /* ignored for now */
+  /** direct convolution. */
+  LIBXSMM_DNN_CONV_ALGO_DIRECT
+} libxsmm_dnn_conv_algo;
+
+/** Denotes the element/pixel type of an image/channel. */
+typedef enum libxsmm_dnn_datatype {
+  LIBXSMM_DNN_DATATYPE_F32,
+  LIBXSMM_DNN_DATATYPE_I32,
+  LIBXSMM_DNN_DATATYPE_I16,
+  LIBXSMM_DNN_DATATYPE_I8
+} libxsmm_dnn_datatype;
+
+libxsmm_dnn_layer* libxsmm_dnn_create_conv_layer(
+  libxsmm_dnn_conv_desc conv_desc, libxsmm_dnn_err_t* status);
+libxsmm_dnn_err_t libxsmm_dnn_destroy_conv_layer(
+  const libxsmm_dnn_layer* handle);
+```
+
+A sample call looks like (without error checks):
+
+```C
+/* declare LIBXSMM variables */
+libxsmm_dnn_conv_desc conv_desc;
+libxsmm_dnn_err_t status;
+libxsmm_dnn_layer* handle;
+/* setting conv_desc values.... */
+conv_desc.N = ...
+/* create handle */
+handle = libxsmm_dnn_create_conv_layer(conv_desc, &status);
+```
+
+Next, activation and filter buffers need to be linked, initialized and bound to the handle. Afterwards the convolution can be executed in a threading environment of choice (error checks are omitted for brevity):
+
+```C
+float *input, *output, *filter;
+libxsmm_dnn_buffer* libxsmm_reg_input;
+libxsmm_dnn_buffer* libxsmm_reg_output;
+libxsmm_dnn_filter* libxsmm_reg_filter;
+
+/* allocate data */
+input = (float*)libxsmm_aligned_malloc(...);
+output = ...;
+
+/* link data to buffers */
+libxsmm_reg_input = libxsmm_dnn_link_buffer(  libxsmm_handle, LIBXSMM_DNN_INPUT, input,
+                                              LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
+libxsmm_reg_output = libxsmm_dnn_link_buffer( libxsmm_handle, LIBXSMM_DNN_OUTPUT, output,
+                                              LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
+libxsmm_reg_filter = libxsmm_dnn_link_filter( libxsmm_handle, LIBXSMM_DNN_FILTER, filter,
+                                              LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
+
+/* copy in data to LIBXSMM format: naive format is: */
+/* (mini-batch)(number-featuremaps)(featuremap-height)(featuremap-width) for layers, */
+/* and the naive format for filters is: */
+/* (number-output-featuremaps)(number-input-featuremaps)(kernel-height)(kernel-width) */
+libxsmm_dnn_copyin_buffer(libxsmm_reg_input, (void*)naive_input, LIBXSMM_DNN_TENSOR_FORMAT_NCHW);
+libxsmm_dnn_zero_buffer(libxsmm_reg_output);
+libxsmm_dnn_copyin_filter(libxsmm_reg_filter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS);
+
+/* bind layer to handle */
+libxsmm_dnn_bind_input_buffer(libxsmm_handle, libxsmm_reg_input, LIBXSMM_DNN_REGULAR_INPUT);
+libxsmm_dnn_bind_output_buffer(libxsmm_handle, libxsmm_reg_output, LIBXSMM_DNN_REGULAR_OUTPUT);
+libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_reg_filter, LIBXSMM_DNN_REGULAR_FILTER);
+
+/* allocate and bind scratch */
+scratch = libxsmm_aligned_scratch(libxsmm_dnn_get_scratch_size(
+  libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status), 2097152);
+libxsmm_dnn_bind_scratch(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch);
+
+/* run the convolution */
+#pragma omp parallel
+{
+  libxsmm_dnn_convolve_st(libxsmm_handle, LIBXSMM_DNN_CONV_KIND_FWD, 0,
+    omp_get_thread_num(), omp_get_num_threads());
+}
+
+/* copy out data */
+libxsmm_dnn_copyout_buffer(libxsmm_output, (void*)naive_libxsmm_output,
+  LIBXSMM_DNN_TENSOR_FORMAT_NCHW);
+
+/* clean up */
+libxsmm_dnn_release_scratch(...);
+libxsmm_dnn_release_buffer(...);
+...
+libxsmm_dnn_destroy_buffer(...);
+...
+libxsmm_dnn_destroy_conv_layer(...);
+```
+
--- a/third_party/libxsmm/documentation/libxsmm_fortran.md
+++ b/third_party/libxsmm/documentation/libxsmm_fortran.md
+Title: LIBXSMM
+project: LIBXSMM
+author: Intel Corporation
+summary: Library targeting Intel Architecture for specialized matrix operations.
+project_github: https://github.com/hfp/libxsmm
+project_download: https://github.com/hfp/libxsmm/releases/latest
+favicon: ../.theme/img/favicon.png
+css: ../.theme/ford.css
+output_dir: ../html
+src_dir: ../include
+search: true
+page_dir: .
+
+Library targeting Intel Architecture for specialized matrix operations: [libxsmm.readthedocs.io/](https://libxsmm.readthedocs.io/)