Unverified Commit 87fb7ed0 authored by Daniil Sizov's avatar Daniil Sizov Committed by GitHub
Browse files

[Config] Enable libxsmm by default for AVX cpu (#5165)

* Enable AVX by default

* Fix linting errors

* Fix win64 build (libxsmm not linked)

Libxsmm on Win64 is not linked, should be disabled by default

* Fix clang format issues

* Change lower supported cpu version to LIBXSMM_X86_AVX2

Change the lowest supported CPU version to LIBXSMM_X86_AVX2 to address issue https://github.com/dmlc/dgl/issues/3459.

* Fix unit test

Remove the assumption that libxsmm is enabled in the config by default (only true for Intel CPUs with AVX2 instructions)

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-15-137.us-west-2.compute.internal>
Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>
parent 8d99d30a
...@@ -24,7 +24,6 @@ endif() ...@@ -24,7 +24,6 @@ endif()
# Alternatively, use cmake -DOPTION=VALUE through command-line. # Alternatively, use cmake -DOPTION=VALUE through command-line.
dgl_option(USE_CUDA "Build with CUDA" OFF) dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON) dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" OFF)
dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON) dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON)
dgl_option(USE_TVM "Build with TVM kernels" OFF) dgl_option(USE_TVM "Build with TVM kernels" OFF)
dgl_option(BUILD_CPP_TEST "Build cpp unittest executables" OFF) dgl_option(BUILD_CPP_TEST "Build cpp unittest executables" OFF)
...@@ -103,17 +102,11 @@ if(USE_OPENMP) ...@@ -103,17 +102,11 @@ if(USE_OPENMP)
message(STATUS "Build with OpenMP.") message(STATUS "Build with OpenMP.")
endif(USE_OPENMP) endif(USE_OPENMP)
if(USE_AVX) if(USE_LIBXSMM)
if(USE_LIBXSMM) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000") message(STATUS "Build with LIBXSMM optimization.")
message(STATUS "Build with LIBXSMM optimization.") endif(USE_LIBXSMM)
else(USE_LIBXSMM)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX")
message(STATUS "Build with AVX optimization.")
endif(USE_LIBXSMM)
endif(USE_AVX)
if ((NOT MSVC) AND USE_EPOLL) if ((NOT MSVC) AND USE_EPOLL)
INCLUDE(CheckIncludeFile) INCLUDE(CheckIncludeFile)
......
...@@ -22,8 +22,8 @@ class Config { ...@@ -22,8 +22,8 @@ class Config {
bool IsLibxsmmAvailable() const; bool IsLibxsmmAvailable() const;
private: private:
Config() = default; Config();
bool libxsmm_ = true; bool libxsmm_;
}; };
} // namespace runtime } // namespace runtime
......
...@@ -19,56 +19,14 @@ ...@@ -19,56 +19,14 @@
#include "spmm_binary_ops.h" #include "spmm_binary_ops.h"
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#include "intel/cpu_support.h"
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
#include "spmm_blocking_libxsmm.h" #include "spmm_blocking_libxsmm.h"
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
namespace dgl { namespace dgl {
namespace aten { namespace aten {
namespace cpu { namespace cpu {
#if !defined(_WIN32)
#ifdef USE_AVX
/**
* @brief CPU kernel of SpMM on Csr format using Xbyak.
* @param cpu_spec JIT'ed kernel
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param X The feature on source nodes.
* @param W The feature on edges.
* @param O The result feature on destination nodes.
* @note it uses node parallel strategy, different threads are responsible
* for the computation of different nodes. For each edge, it uses the
* JIT'ed kernel.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrXbyak(
dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast,
const CSRMatrix& csr, const DType* X, const DType* W, DType* O) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
}
}
});
}
#endif // USE_AVX
#endif // _WIN32
/** /**
* @brief Naive CPU kernel of SpMM on Csr format. * @brief Naive CPU kernel of SpMM on Csr format.
* @param cpu_spec JIT'ed kernel * @param cpu_spec JIT'ed kernel
...@@ -142,7 +100,6 @@ void SpMMSumCsr( ...@@ -142,7 +100,6 @@ void SpMMSumCsr(
CHECK_NOTNULL(W); CHECK_NOTNULL(W);
} }
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
const bool no_libxsmm = bcast.use_bcast || const bool no_libxsmm = bcast.use_bcast ||
std::is_same<DType, double>::value || std::is_same<DType, double>::value ||
...@@ -151,27 +108,12 @@ void SpMMSumCsr( ...@@ -151,27 +108,12 @@ void SpMMSumCsr(
SpMMSumCsrLibxsmm<IdType, DType, Op>(bcast, csr, ufeat, efeat, out); SpMMSumCsrLibxsmm<IdType, DType, Op>(bcast, csr, ufeat, efeat, out);
} else { } else {
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
typedef dgl::ElemWiseAddUpdate<Op> ElemWiseUpd;
/* Prepare an assembler kernel */
static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
/* Distribute the kernel among OMP threads */
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
if (cpu_spec && bcast.out_len > 16 && !bcast.use_bcast) {
SpMMSumCsrXbyak<IdType, DType, Op>(cpu_spec, bcast, csr, X, W, O);
} else {
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O); SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O);
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
}
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
} }
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
} }
...@@ -272,7 +214,6 @@ void SpMMCmpCsr( ...@@ -272,7 +214,6 @@ void SpMMCmpCsr(
CHECK_NOTNULL(argW); CHECK_NOTNULL(argW);
} }
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
const bool no_libxsmm = bcast.use_bcast || const bool no_libxsmm = bcast.use_bcast ||
...@@ -283,7 +224,6 @@ void SpMMCmpCsr( ...@@ -283,7 +224,6 @@ void SpMMCmpCsr(
bcast, csr, ufeat, efeat, out, argu, arge); bcast, csr, ufeat, efeat, out, argu, arge);
} else { } else {
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) { runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
...@@ -313,11 +253,9 @@ void SpMMCmpCsr( ...@@ -313,11 +253,9 @@ void SpMMCmpCsr(
} }
}); });
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
} }
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
} }
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <algorithm> #include <algorithm>
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
#include <libxsmm.h> #include <libxsmm.h>
#include <unistd.h> #include <unistd.h>
...@@ -589,7 +588,6 @@ void SpMMCmpCsrLibxsmm( ...@@ -589,7 +588,6 @@ void SpMMCmpCsrLibxsmm(
} // namespace dgl } // namespace dgl
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
#endif // DGL_ARRAY_CPU_SPMM_BLOCKING_LIBXSMM_H_ #endif // DGL_ARRAY_CPU_SPMM_BLOCKING_LIBXSMM_H_
...@@ -6,12 +6,23 @@ ...@@ -6,12 +6,23 @@
#include <dgl/runtime/config.h> #include <dgl/runtime/config.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <libxsmm_cpuid.h>
using namespace dgl::runtime; using namespace dgl::runtime;
namespace dgl { namespace dgl {
namespace runtime { namespace runtime {
Config::Config() {
#if !defined(_WIN32) && defined(USE_LIBXSMM)
int cpu_id = libxsmm_cpuid_x86();
// Enable libxsmm on AVX machines by default
libxsmm_ = LIBXSMM_X86_AVX2 <= cpu_id && cpu_id <= LIBXSMM_X86_ALLFEAT;
#else
libxsmm_ = false;
#endif
}
void Config::EnableLibxsmm(bool b) { libxsmm_ = b; } void Config::EnableLibxsmm(bool b) { libxsmm_ = b; }
bool Config::IsLibxsmmAvailable() const { return libxsmm_; } bool Config::IsLibxsmmAvailable() const { return libxsmm_; }
......
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#include <../src/array/cpu/spmm.h> #include <../src/array/cpu/spmm.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -75,28 +74,14 @@ void Div(T* exp, T* out, T* lhs, T* rhs, int dim) { ...@@ -75,28 +74,14 @@ void Div(T* exp, T* out, T* lhs, T* rhs, int dim) {
} }
template <class T> template <class T>
void CheckResult(T* exp, T* out, T* out_intel_kernel, int dim) { void CheckResult(T* exp, T* out, int dim) {
for (int i = 0; i < dim; i++) { for (int i = 0; i < dim; i++) {
ASSERT_TRUE(exp[i] == out[i]); ASSERT_TRUE(exp[i] == out[i]);
if (out_intel_kernel != nullptr) {
ASSERT_TRUE(out[i] == out_intel_kernel[i]);
}
} }
} }
} // namespace } // namespace
template <class ElemWiseUpd>
ElemWiseUpd* generic_ElemWiseUpd() {
static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
return cpu_spec;
}
template <typename IDX> template <typename IDX>
void _TestSpmmCopyLhs() { void _TestSpmmCopyLhs() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) { for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
...@@ -113,18 +98,7 @@ void _TestSpmmCopyLhs() { ...@@ -113,18 +98,7 @@ void _TestSpmmCopyLhs() {
out[k] += ns_op::CopyLhs<IDX>::Call(lhs + k, nullptr); out[k] += ns_op::CopyLhs<IDX>::Call(lhs + k, nullptr);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyLhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, nullptr, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -149,18 +123,7 @@ void _TestSpmmCopyRhs() { ...@@ -149,18 +123,7 @@ void _TestSpmmCopyRhs() {
out[k] += ns_op::CopyRhs<IDX>::Call(nullptr, rhs + k); out[k] += ns_op::CopyRhs<IDX>::Call(nullptr, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyRhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, nullptr, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -186,18 +149,7 @@ void _TestSpmmAdd() { ...@@ -186,18 +149,7 @@ void _TestSpmmAdd() {
out[k] += ns_op::Add<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Add<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Add<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -223,18 +175,7 @@ void _TestSpmmSub() { ...@@ -223,18 +175,7 @@ void _TestSpmmSub() {
out[k] += ns_op::Sub<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Sub<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Sub<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -260,18 +201,7 @@ void _TestSpmmMul() { ...@@ -260,18 +201,7 @@ void _TestSpmmMul() {
out[k] += ns_op::Mul<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Mul<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Mul<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -297,18 +227,7 @@ void _TestSpmmDiv() { ...@@ -297,18 +227,7 @@ void _TestSpmmDiv() {
out[k] += ns_op::Div<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Div<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Div<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -316,5 +235,4 @@ TEST(SpmmTest, TestSpmmDiv) { ...@@ -316,5 +235,4 @@ TEST(SpmmTest, TestSpmmDiv) {
_TestSpmmDiv<float>(); _TestSpmmDiv<float>();
_TestSpmmDiv<double>(); _TestSpmmDiv<double>();
} }
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
...@@ -487,8 +487,6 @@ def test_use_libxsmm_switch(): ...@@ -487,8 +487,6 @@ def test_use_libxsmm_switch():
x = torch.ones(3, 2, requires_grad=True) x = torch.ones(3, 2, requires_grad=True)
y = torch.arange(1, 13).float().view(6, 2).requires_grad_() y = torch.arange(1, 13).float().view(6, 2).requires_grad_()
assert dgl.is_libxsmm_enabled()
dgl.ops.u_mul_e_sum(g, x, y)
dgl.use_libxsmm(False) dgl.use_libxsmm(False)
assert ~dgl.is_libxsmm_enabled() assert ~dgl.is_libxsmm_enabled()
dgl.ops.u_mul_e_sum(g, x, y) dgl.ops.u_mul_e_sum(g, x, y)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment