Unverified Commit 87fb7ed0 authored by Daniil Sizov's avatar Daniil Sizov Committed by GitHub
Browse files

[Config] Enable libxsmm by default for AVX cpu (#5165)

* Enable AVX by default

* Fix linting errors

* Fix win64 build (libxsmm not linked)

Libxsmm on Win64 is not linked, should be disabled by default

* Fix clang format issues

* Change lower supported cpu version to LIBXSMM_X86_AVX2

Change the lowest supported CPU version to LIBXSMM_X86_AVX2 to address issue https://github.com/dmlc/dgl/issues/3459.

* Fix unit test

Remove the assumption that libxsmm is enabled in the config by default (only true for Intel CPUs with AVX2 instructions)

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-15-137.us-west-2.compute.internal>
Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>
parent 8d99d30a
...@@ -24,7 +24,6 @@ endif() ...@@ -24,7 +24,6 @@ endif()
# Alternatively, use cmake -DOPTION=VALUE through command-line. # Alternatively, use cmake -DOPTION=VALUE through command-line.
dgl_option(USE_CUDA "Build with CUDA" OFF) dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON) dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" OFF)
dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON) dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON)
dgl_option(USE_TVM "Build with TVM kernels" OFF) dgl_option(USE_TVM "Build with TVM kernels" OFF)
dgl_option(BUILD_CPP_TEST "Build cpp unittest executables" OFF) dgl_option(BUILD_CPP_TEST "Build cpp unittest executables" OFF)
...@@ -103,17 +102,11 @@ if(USE_OPENMP) ...@@ -103,17 +102,11 @@ if(USE_OPENMP)
message(STATUS "Build with OpenMP.") message(STATUS "Build with OpenMP.")
endif(USE_OPENMP) endif(USE_OPENMP)
if(USE_AVX) if(USE_LIBXSMM)
if(USE_LIBXSMM) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000") message(STATUS "Build with LIBXSMM optimization.")
message(STATUS "Build with LIBXSMM optimization.") endif(USE_LIBXSMM)
else(USE_LIBXSMM)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX")
message(STATUS "Build with AVX optimization.")
endif(USE_LIBXSMM)
endif(USE_AVX)
if ((NOT MSVC) AND USE_EPOLL) if ((NOT MSVC) AND USE_EPOLL)
INCLUDE(CheckIncludeFile) INCLUDE(CheckIncludeFile)
......
...@@ -22,8 +22,8 @@ class Config { ...@@ -22,8 +22,8 @@ class Config {
bool IsLibxsmmAvailable() const; bool IsLibxsmmAvailable() const;
private: private:
Config() = default; Config();
bool libxsmm_ = true; bool libxsmm_;
}; };
} // namespace runtime } // namespace runtime
......
...@@ -19,56 +19,14 @@ ...@@ -19,56 +19,14 @@
#include "spmm_binary_ops.h" #include "spmm_binary_ops.h"
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#include "intel/cpu_support.h"
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
#include "spmm_blocking_libxsmm.h" #include "spmm_blocking_libxsmm.h"
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
namespace dgl { namespace dgl {
namespace aten { namespace aten {
namespace cpu { namespace cpu {
#if !defined(_WIN32)
#ifdef USE_AVX
/**
* @brief CPU kernel of SpMM on Csr format using Xbyak.
* @param cpu_spec JIT'ed kernel
* @param bcast Broadcast information.
* @param csr The Csr matrix.
* @param X The feature on source nodes.
* @param W The feature on edges.
* @param O The result feature on destination nodes.
* @note it uses node parallel strategy, different threads are responsible
* for the computation of different nodes. For each edge, it uses the
* JIT'ed kernel.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrXbyak(
dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast,
const CSRMatrix& csr, const DType* X, const DType* W, DType* O) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
}
}
});
}
#endif // USE_AVX
#endif // _WIN32
/** /**
* @brief Naive CPU kernel of SpMM on Csr format. * @brief Naive CPU kernel of SpMM on Csr format.
* @param cpu_spec JIT'ed kernel * @param cpu_spec JIT'ed kernel
...@@ -142,7 +100,6 @@ void SpMMSumCsr( ...@@ -142,7 +100,6 @@ void SpMMSumCsr(
CHECK_NOTNULL(W); CHECK_NOTNULL(W);
} }
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
const bool no_libxsmm = bcast.use_bcast || const bool no_libxsmm = bcast.use_bcast ||
std::is_same<DType, double>::value || std::is_same<DType, double>::value ||
...@@ -151,27 +108,12 @@ void SpMMSumCsr( ...@@ -151,27 +108,12 @@ void SpMMSumCsr(
SpMMSumCsrLibxsmm<IdType, DType, Op>(bcast, csr, ufeat, efeat, out); SpMMSumCsrLibxsmm<IdType, DType, Op>(bcast, csr, ufeat, efeat, out);
} else { } else {
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
typedef dgl::ElemWiseAddUpdate<Op> ElemWiseUpd;
/* Prepare an assembler kernel */
static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
/* Distribute the kernel among OMP threads */
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
if (cpu_spec && bcast.out_len > 16 && !bcast.use_bcast) {
SpMMSumCsrXbyak<IdType, DType, Op>(cpu_spec, bcast, csr, X, W, O);
} else {
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O); SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O);
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
}
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
} }
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
} }
...@@ -272,7 +214,6 @@ void SpMMCmpCsr( ...@@ -272,7 +214,6 @@ void SpMMCmpCsr(
CHECK_NOTNULL(argW); CHECK_NOTNULL(argW);
} }
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
const bool no_libxsmm = bcast.use_bcast || const bool no_libxsmm = bcast.use_bcast ||
...@@ -283,7 +224,6 @@ void SpMMCmpCsr( ...@@ -283,7 +224,6 @@ void SpMMCmpCsr(
bcast, csr, ufeat, efeat, out, argu, arge); bcast, csr, ufeat, efeat, out, argu, arge);
} else { } else {
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) { runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
...@@ -313,11 +253,9 @@ void SpMMCmpCsr( ...@@ -313,11 +253,9 @@ void SpMMCmpCsr(
} }
}); });
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
} }
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
} }
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <algorithm> #include <algorithm>
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM #ifdef USE_LIBXSMM
#include <libxsmm.h> #include <libxsmm.h>
#include <unistd.h> #include <unistd.h>
...@@ -589,7 +588,6 @@ void SpMMCmpCsrLibxsmm( ...@@ -589,7 +588,6 @@ void SpMMCmpCsrLibxsmm(
} // namespace dgl } // namespace dgl
#endif // USE_LIBXSMM #endif // USE_LIBXSMM
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
#endif // DGL_ARRAY_CPU_SPMM_BLOCKING_LIBXSMM_H_ #endif // DGL_ARRAY_CPU_SPMM_BLOCKING_LIBXSMM_H_
...@@ -6,12 +6,23 @@ ...@@ -6,12 +6,23 @@
#include <dgl/runtime/config.h> #include <dgl/runtime/config.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <libxsmm_cpuid.h>
using namespace dgl::runtime; using namespace dgl::runtime;
namespace dgl { namespace dgl {
namespace runtime { namespace runtime {
Config::Config() {
#if !defined(_WIN32) && defined(USE_LIBXSMM)
int cpu_id = libxsmm_cpuid_x86();
// Enable libxsmm on AVX machines by default
libxsmm_ = LIBXSMM_X86_AVX2 <= cpu_id && cpu_id <= LIBXSMM_X86_ALLFEAT;
#else
libxsmm_ = false;
#endif
}
void Config::EnableLibxsmm(bool b) { libxsmm_ = b; } void Config::EnableLibxsmm(bool b) { libxsmm_ = b; }
bool Config::IsLibxsmmAvailable() const { return libxsmm_; } bool Config::IsLibxsmmAvailable() const { return libxsmm_; }
......
#if !defined(_WIN32) #if !defined(_WIN32)
#ifdef USE_AVX
#include <../src/array/cpu/spmm.h> #include <../src/array/cpu/spmm.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -75,28 +74,14 @@ void Div(T* exp, T* out, T* lhs, T* rhs, int dim) { ...@@ -75,28 +74,14 @@ void Div(T* exp, T* out, T* lhs, T* rhs, int dim) {
} }
template <class T> template <class T>
void CheckResult(T* exp, T* out, T* out_intel_kernel, int dim) { void CheckResult(T* exp, T* out, int dim) {
for (int i = 0; i < dim; i++) { for (int i = 0; i < dim; i++) {
ASSERT_TRUE(exp[i] == out[i]); ASSERT_TRUE(exp[i] == out[i]);
if (out_intel_kernel != nullptr) {
ASSERT_TRUE(out[i] == out_intel_kernel[i]);
}
} }
} }
} // namespace } // namespace
template <class ElemWiseUpd>
ElemWiseUpd* generic_ElemWiseUpd() {
static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
return cpu_spec;
}
template <typename IDX> template <typename IDX>
void _TestSpmmCopyLhs() { void _TestSpmmCopyLhs() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) { for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
...@@ -113,18 +98,7 @@ void _TestSpmmCopyLhs() { ...@@ -113,18 +98,7 @@ void _TestSpmmCopyLhs() {
out[k] += ns_op::CopyLhs<IDX>::Call(lhs + k, nullptr); out[k] += ns_op::CopyLhs<IDX>::Call(lhs + k, nullptr);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyLhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, nullptr, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -149,18 +123,7 @@ void _TestSpmmCopyRhs() { ...@@ -149,18 +123,7 @@ void _TestSpmmCopyRhs() {
out[k] += ns_op::CopyRhs<IDX>::Call(nullptr, rhs + k); out[k] += ns_op::CopyRhs<IDX>::Call(nullptr, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyRhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, nullptr, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -186,18 +149,7 @@ void _TestSpmmAdd() { ...@@ -186,18 +149,7 @@ void _TestSpmmAdd() {
out[k] += ns_op::Add<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Add<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Add<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -223,18 +175,7 @@ void _TestSpmmSub() { ...@@ -223,18 +175,7 @@ void _TestSpmmSub() {
out[k] += ns_op::Sub<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Sub<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Sub<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -260,18 +201,7 @@ void _TestSpmmMul() { ...@@ -260,18 +201,7 @@ void _TestSpmmMul() {
out[k] += ns_op::Mul<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Mul<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Mul<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -297,18 +227,7 @@ void _TestSpmmDiv() { ...@@ -297,18 +227,7 @@ void _TestSpmmDiv() {
out[k] += ns_op::Div<IDX>::Call(lhs + k, rhs + k); out[k] += ns_op::Div<IDX>::Call(lhs + k, rhs + k);
} }
// Calculation of output using intel path - 'out_intel_kernel' CheckResult(exp, out, dim);
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Div<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
} }
} }
...@@ -316,5 +235,4 @@ TEST(SpmmTest, TestSpmmDiv) { ...@@ -316,5 +235,4 @@ TEST(SpmmTest, TestSpmmDiv) {
_TestSpmmDiv<float>(); _TestSpmmDiv<float>();
_TestSpmmDiv<double>(); _TestSpmmDiv<double>();
} }
#endif // USE_AVX
#endif // _WIN32 #endif // _WIN32
...@@ -487,8 +487,6 @@ def test_use_libxsmm_switch(): ...@@ -487,8 +487,6 @@ def test_use_libxsmm_switch():
x = torch.ones(3, 2, requires_grad=True) x = torch.ones(3, 2, requires_grad=True)
y = torch.arange(1, 13).float().view(6, 2).requires_grad_() y = torch.arange(1, 13).float().view(6, 2).requires_grad_()
assert dgl.is_libxsmm_enabled()
dgl.ops.u_mul_e_sum(g, x, y)
dgl.use_libxsmm(False) dgl.use_libxsmm(False)
assert ~dgl.is_libxsmm_enabled() assert ~dgl.is_libxsmm_enabled()
dgl.ops.u_mul_e_sum(g, x, y) dgl.ops.u_mul_e_sum(g, x, y)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment