Unverified Commit f8ebcd7f authored by pawelpiotrowicz, committed by GitHub

[Performance] Dynamic cpu kernel V3 for SpMMSumCsr all Ops (#2309)



* support AVX512

* env DGL_CPU_INTEL_KERNEL_ENABLED=1

* env DGL_CPU_INTEL_KERNEL_LOG=1

* Add unittest test_spmm.cc
Co-authored-by: Izabela Mazur <izabela.mazur@intel.com>
Co-authored-by: Michal Szarmach <michal.szarmach@intel.com>

Review patch
parent 62b4bbb4
@@ -23,3 +23,6 @@
[submodule "third_party/thrust"]
path = third_party/thrust
url = https://github.com/NVIDIA/thrust.git
[submodule "third_party/xbyak"]
path = third_party/xbyak
url = https://github.com/herumi/xbyak
@@ -59,6 +59,7 @@ include_directories("third_party/dmlc-core/include")
include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/phmap/")
include_directories("third_party/xbyak/")
# initial variables
set(DGL_LINKER_LIBS "")
@@ -22,3 +22,15 @@ Data Repository
* ``DGL_DOWNLOAD_DIR``:
  * Values: String (default="${HOME}/.dgl")
  * The local directory to cache the downloaded data.
Intel CPU Performance Options
-----------------------------
* ``DGL_CPU_INTEL_KERNEL_ENABLED``:
  * Values: int (default=0)
  * Enable the dynamically generated (JIT) AVX-512 CPU kernels.
  * Suggested values: 1
* ``DGL_CPU_INTEL_KERNEL_LOG``:
  * Values: int (default=0)
  * Print diagnostic messages from the Intel kernels (debug mode).
  * Suggested values: 1

A short sketch of reading these variables from C++ follows the list.
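For illustration, a minimal sketch (not DGL's code path, though the header below reads the variables the same way) of checking both flags with ``std::getenv``:

#include <cstdio>
#include <cstdlib>

int main() {
  // Launch with e.g. DGL_CPU_INTEL_KERNEL_ENABLED=1 DGL_CPU_INTEL_KERNEL_LOG=1
  const char* enabled = std::getenv("DGL_CPU_INTEL_KERNEL_ENABLED");
  const char* log = std::getenv("DGL_CPU_INTEL_KERNEL_LOG");
  std::printf("intel kernel enabled: %lld, logging: %s\n",
              enabled ? std::atoll(enabled) : 0LL, log ? "on" : "off");
  return 0;
}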
/*!
* Copyright (c) 2019 by Contributors
* \file intel/cpu_support.h
* \brief Intel CPU support
* \author Pawel Piotrowicz <pawel.piotrowicz@intel.com>
*/
#ifndef INTEL_CPU_SUPPORT_H_
#define INTEL_CPU_SUPPORT_H_
#include <cstdlib>
#include <memory>
#include <tuple>
#include <type_traits>
#include "dmlc/logging.h"
#include "meta_utils.h"
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
namespace dgl {
typedef std::tuple<float, double> supported_types;
#ifndef log_intel
#define log_intel(x) \
if (IntelKernel<>::IsLogEnabled()) { \
LOG(INFO) << x; \
}
#endif
static inline Xbyak::Zmm make_zmm(const Xbyak::Xmm &v) {
return Xbyak::Zmm(v.getIdx());
}
template <int version = 0>
struct IntelKernel {
static int64_t GetValue() {
int64_t v = 0;
const char *label = "DGL_CPU_INTEL_KERNEL_ENABLED";
const char *ptr = std::getenv(label);
if (ptr) {
v = atoll(ptr);
log_intel(label << "=>" << v);
}
return v;
}
static int64_t IsEnabled() {
static int64_t r = IntelKernel<version>::GetValue();
return r;
}
static int IsLogEnabled() {
static int r = (std::getenv("DGL_CPU_INTEL_KERNEL_LOG")) ? 1 : 0;
return r;
}
};
/*!
* \brief JIT-assembled element-wise "out += lhs <op> rhs" kernel using Intel
*        AVX512 instructions.
* \note The kernel is generated only if the CPU reports AVX512F support;
*       otherwise applicable() returns false and callers should fall back to
*       the scalar path.
*/
template <class Op>
class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
public:
typedef typename Op::type DType;
static_assert(
std::is_base_of<std::true_type,
utils::has_type<DType, supported_types>>::value,
"dgl::ElemWiseAddUpdate<Op>: Op::type (DType) is not a supported type");
protected:
const Xbyak::Reg64 &r_out_;
const Xbyak::Reg64 &r_left_;
const Xbyak::Reg64 &r_right;
const Xbyak::Reg64 &r_size_;
/* [functional] Is the kernel applicable on this machine? */
bool applicable_;
public:
static constexpr int UNIT_SIZE_BYTES = sizeof(DType);
static constexpr int BITS_IN_BYTES = 8;
static constexpr int REG_BIT_SIZE = 512;
static constexpr int UNIT_PER_REG =
REG_BIT_SIZE / (UNIT_SIZE_BYTES * BITS_IN_BYTES);
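// Worked example (assuming 512-bit zmm registers): for float,
// UNIT_PER_REG = 512 / (4 * 8) = 16 elements per register; for double,
// UNIT_PER_REG = 512 / (8 * 8) = 8 elements per register. The main loop below
// advances r9 by UNIT_PER_REG and the tail is handled with a k-mask.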
template <class TType, class R1, class R2,
utils::CheckCmp<TType, float> = true>
void alias_load(R1 r1, R2 r2) {
vmovups(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, double> = true>
void alias_load(R1 r1, R2 r2) {
vmovupd(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, float> = true>
void alias_save(R1 r1, R2 r2) {
alias_load<TType>(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, double> = true>
void alias_save(R1 r1, R2 r2) {
alias_load<TType>(r1, r2);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_ADD(R1 r1, R2 r2, R3 r3) {
vaddps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_ADD(R1 r1, R2 r2, R3 r3) {
vaddpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_SUB(R1 r1, R2 r2, R3 r3) {
vsubps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_SUB(R1 r1, R2 r2, R3 r3) {
vsubpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_DIV(R1 r1, R2 r2, R3 r3) {
vdivps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_DIV(R1 r1, R2 r2, R3 r3) {
vdivpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_MUL(R1 r1, R2 r2, R3 r3) {
vmulps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_MUL(R1 r1, R2 r2, R3 r3) {
vmulpd(r1, r2, r3);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
alias_load<IType>(zmm1, ptr[r_left_ + r9 * sizeof(IType)]);
alias_ADD<IType>(zmm2, zmm0, zmm1);
alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
alias_load<IType>(zmm1, ptr[r_right + r9 * sizeof(IType)]);
alias_ADD<IType>(zmm2, zmm0, zmm1);
alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
}
template <class T>
void loop_pre() {
alias_load<T>(zmm0, ptr[r_out_ + r9 * sizeof(T)]);
alias_load<T>(zmm1, ptr[r_left_ + r9 * sizeof(T)]);
alias_load<T>(zmm2, ptr[r_right + r9 * sizeof(T)]);
}
template <class T>
void loop_post() {
alias_ADD<T>(zmm2, zmm0, zmm2);
alias_save<T>(ptr[r_out_ + r9 * sizeof(T)], zmm2);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_ADD<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_SUB<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_DIV<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_MUL<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(IType)]);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_right + r9 * sizeof(IType)]);
}
template <class T>
void remainder_fetch_LR(const Xbyak::Opmask mask) {
alias_load<T>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(T)]);
alias_load<T>(make_zmm(zmm1) | mask, ptr[r_right + r9 * sizeof(T)]);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_MUL<IType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_ADD<DType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_DIV<DType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_SUB<DType>(zmm2, zmm2, zmm1);
}
ElemWiseAddUpdate()
: r_out_(rdi),
r_left_(rsi),
r_right(rdx),
r_size_(rcx),
applicable_(false) {
static Xbyak::util::Cpu current_cpu;
/* Default case for all */
if (current_cpu.has(Xbyak::util::Cpu::tAVX512F)) {
/* prepare REMAINDER */
mov(r8, r_size_);
and_(r8,
UNIT_PER_REG - 1);  // r8 = size % UNIT_PER_REG (remainder elements)
xor_(r9, r9); // reset r9
cmp(r_size_, UNIT_PER_REG);  // if (size < UNIT_PER_REG) there are no full chunks
jl("remainder");
/* drop the remainder so r_size_ covers only full-register chunks */
sub(r_size_, r8);  // r_size_ -= r8
cmp(r_size_, 0); // do we have any full chunks ?
jz("remainder");
L("for_i");
full_chunk_loop_operations<Op>();
add(r9, UNIT_PER_REG);  // r9 += UNIT_PER_REG (elements per zmm register)
cmp(r_size_, r9); // more full chunks ?
jnz("for_i");
L("remainder");
cmp(r8, 0); // do we have a remainder ?
jz("done");
/* prepare a bitmask for k1 */
mov(rax, 1);
mov(r_size_, r8);
sal(rax, cl);
dec(rax); // k1= (1 << r8 )-1
kmovw(k1, eax); // set bitmask
alias_load<DType>(make_zmm(zmm0) | k1,
ptr[r_out_ + r9 * UNIT_SIZE_BYTES]);
remainder_operations<Op>(k1);
alias_ADD<DType>(zmm3, zmm2, zmm0);
alias_save<DType>(ptr[r_out_ + r9 * UNIT_SIZE_BYTES],
make_zmm(zmm3) | k1);
L("done");
applicable_ = true;
log_intel("AVX512F cpu kernel is ready");
}
ret();
}
bool applicable() const { return applicable_; }
template <class... P>
void run(P... args) {
((void (*)(P...))(this)->getCode())(args...);
}
};
} // namespace dgl
#endif // INTEL_CPU_SUPPORT_H_
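For orientation, a minimal sketch of how the JIT kernel is meant to be driven (it mirrors the unit test at the bottom of this patch; the include paths are assumptions that depend on how the DGL source tree is added to the compiler's include path):

#include <cstdint>
#include <vector>
#include "array/cpu/spmm_binary_ops.h"  // dgl::aten::cpu::op::Add (path assumed)
#include "intel/cpu_support.h"          // dgl::ElemWiseAddUpdate (path assumed)

int main() {
  using Op = dgl::aten::cpu::op::Add<float>;
  dgl::ElemWiseAddUpdate<Op> kernel;  // assembles the AVX512 code at construction
  std::vector<float> out(33, 0.f), lhs(33, 1.f), rhs(33, 2.f);
  if (kernel.applicable()) {  // false when the CPU lacks AVX512F
    // out[i] += lhs[i] + rhs[i]; 33 = 2 full zmm chunks of 16 floats + 1 remainder
    kernel.run(out.data(), lhs.data(), rhs.data(),
               static_cast<int64_t>(out.size()));
  }
  return 0;
}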
/*!
* Copyright (c) 2019 by Contributors
* \file intel/meta_utils.h
* \brief Meta programming utils
* \author Pawel Piotrowicz <pawel.piotrowicz@intel.com>
*/
#ifndef INTEL_META_UTILS_H_
#define INTEL_META_UTILS_H_
#include <tuple>
#include <type_traits>
namespace dgl {
namespace utils {
template <typename T, typename Tuple>
struct has_type;
template <typename T>
struct has_type<T, std::tuple<>> : std::false_type {};
template <typename T, typename U, typename... Ts>
struct has_type<T, std::tuple<U, Ts...>> : has_type<T, std::tuple<Ts...>> {};
template <typename T, typename... Ts>
struct has_type<T, std::tuple<T, Ts...>> : std::true_type {};
template <class OCmp, template <class> class ToP, class Tup,
int ok = std::tuple_size<Tup>::value>
struct DeepType;
template <class OCmp, template <class> class ToP, class Tup>
struct DeepType<OCmp, ToP, Tup, 1> {
typedef typename std::tuple_element<0, Tup>::type EL1;
enum { value = std::is_same<OCmp, ToP<EL1>>::value };
};
template <class OCmp, template <class> class ToP, class Tup>
struct DeepType<OCmp, ToP, Tup, 2> {
typedef typename std::tuple_element<0, Tup>::type EL1;
typedef typename std::tuple_element<1, Tup>::type EL2;
enum {
value = (std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value)
};
};
template <class OCmp, template <class> class ToP, class Tup>
struct DeepType<OCmp, ToP, Tup, 3> {
typedef typename std::tuple_element<0, Tup>::type EL1;
typedef typename std::tuple_element<1, Tup>::type EL2;
typedef typename std::tuple_element<2, Tup>::type EL3;
enum {
value = (std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value ||
std::is_same<OCmp, ToP<EL3>>::value)
};
};
template <bool b>
using Required = typename std::enable_if<b, bool>::type;
template <class L, class R>
using CheckCmp = Required<std::is_same<L, R>::value>;
template <class L, class R1, class R2>
using CheckCmp_2 =
Required<std::is_same<L, R1>::value || std::is_same<L, R2>::value>;
template <class OpType, template <class> class TPP, class Tup>
using Verify = Required<utils::DeepType<OpType, TPP, Tup>::value>;
} // namespace utils
} // namespace dgl
#endif // INTEL_META_UTILS_H_
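A few compile-time checks (a sketch; the include path is an assumption) showing what these helpers compute: ``has_type`` asks whether a tuple contains a type, while ``DeepType`` (and the ``Verify`` alias built on it) asks whether an operator class is an instantiation of a given template over any element of the tuple:

#include <tuple>
#include <type_traits>
#include "intel/meta_utils.h"  // path assumed

template <class T>
struct Box {};  // stand-in for an operator template such as op::Add<T>

static_assert(dgl::utils::has_type<float, std::tuple<float, double>>::value,
              "float is in the tuple");
static_assert(!dgl::utils::has_type<int, std::tuple<float, double>>::value,
              "int is not in the tuple");
static_assert(dgl::utils::DeepType<Box<double>, Box,
                                   std::tuple<float, double>>::value,
              "Box<double> matches Box<EL> for EL = double");

int main() { return 0; }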
@@ -8,9 +8,13 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <algorithm>
#include <limits>
#include <memory>
#include "spmm_binary_ops.h"
#if !defined(_WIN32)
#include "intel/cpu_support.h"
#endif
namespace dgl {
namespace aten {
namespace cpu {
@@ -26,40 +30,62 @@ namespace cpu {
 * for the computation of different nodes.
 */
template <typename IdType, typename DType, typename Op>
void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
                NDArray efeat, NDArray out) {
  const bool has_idx = !IsNullArray(csr.data);
  const IdType* indptr = csr.indptr.Ptr<IdType>();
  const IdType* indices = csr.indices.Ptr<IdType>();
  const IdType* edges = csr.data.Ptr<IdType>();
  const DType* X = ufeat.Ptr<DType>();
  const DType* W = efeat.Ptr<DType>();
  int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
  DType* O = out.Ptr<DType>();
#if !defined(_WIN32)
  typedef dgl::ElemWiseAddUpdate<Op> ElemWiseUpd;
  /* Prepare an assembler kernel */
  static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
      (dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
  /* Distribute the kernel among OMP threads */
  ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
                              ? asm_kernel_ptr.get()
                              : nullptr;
  if (cpu_spec && dim > 16 && !bcast.use_bcast) {
#pragma omp parallel for
    for (IdType rid = 0; rid < csr.num_rows; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      std::fill(out_off, out_off + dim, 0);
      for (IdType j = row_start; j < row_end; ++j) {
        const IdType cid = indices[j];
        const IdType eid = has_idx ? edges[j] : j;
        cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
      }
    }
  } else {
#endif
#pragma omp parallel for
    for (IdType rid = 0; rid < csr.num_rows; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      std::fill(out_off, out_off + dim, 0);
      for (IdType j = row_start; j < row_end; ++j) {
        const IdType cid = indices[j];
        const IdType eid = has_idx ? edges[j] : j;
        for (int64_t k = 0; k < dim; ++k) {
          const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
          const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
          const DType* lhs_off =
              Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
          const DType* rhs_off =
              Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
          out_off[k] += Op::Call(lhs_off, rhs_off);
        }
      }
    }
#if !defined(_WIN32)
  }
#endif
}
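To make the loop structure concrete, here is a small self-contained worked example (plain C++, not DGL's API) of the scalar path for the "mul" operator: two destination rows, three edges, two-dimensional features, no broadcasting, and eid == j because there is no edge-id mapping.

#include <cstdio>
#include <vector>

int main() {
  // CSR with two destination rows: row 0 has in-edges from nodes {0, 1},
  // row 1 has one in-edge from node 2.
  std::vector<int> indptr = {0, 2, 3};
  std::vector<int> indices = {0, 1, 2};
  const int dim = 2;                                  // feature length
  std::vector<float> ufeat = {1, 2, 3, 4, 5, 6};      // 3 source nodes x dim
  std::vector<float> efeat = {1, 1, 2, 2, 3, 3};      // 3 edges x dim
  std::vector<float> out(2 * dim, 0.f);
  for (int rid = 0; rid < 2; ++rid) {                 // parallelized per row in DGL
    for (int j = indptr[rid]; j < indptr[rid + 1]; ++j) {
      const int cid = indices[j];                     // source node, eid == j
      for (int k = 0; k < dim; ++k)
        out[rid * dim + k] += ufeat[cid * dim + k] * efeat[j * dim + k];
    }
  }
  std::printf("out = [%g %g | %g %g]\n", out[0], out[1], out[2], out[3]);
  // row 0: (1,2)*1 + (3,4)*2 = (7,10); row 1: (5,6)*3 = (15,18)
}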
/*!
@@ -74,20 +100,15 @@ void SpMMSumCsr(
 * we use atomic operators in the reduction phase.
 */
template <typename IdType, typename DType, typename Op>
void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
                NDArray efeat, NDArray out) {
  const bool has_idx = !IsNullArray(coo.data);
  const IdType* row = coo.row.Ptr<IdType>();
  const IdType* col = coo.col.Ptr<IdType>();
  const IdType* edges = coo.data.Ptr<IdType>();
  const DType* X = ufeat.Ptr<DType>();
  const DType* W = efeat.Ptr<DType>();
  int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
  DType* O = out.Ptr<DType>();
  const int64_t nnz = coo.row->shape[0];
  // fill zero elements
@@ -97,13 +118,15 @@ void SpMMSumCoo(
  for (IdType i = 0; i < nnz; ++i) {
    const IdType rid = row[i];
    const IdType cid = col[i];
    const IdType eid = has_idx ? edges[i] : i;
    DType* out_off = O + cid * dim;
    for (int64_t k = 0; k < dim; ++k) {
      const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
      const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
      const DType* lhs_off =
          Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
      const DType* rhs_off =
          Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
      const DType val = Op::Call(lhs_off, rhs_off);
      if (val != 0) {
#pragma omp atomic
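The COO variant parallelizes over edges instead of destination nodes, so two threads can hit the same output row; a standalone sketch (plain C++, not DGL's API) of that strategy with the copy_lhs operator and scalar features:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> row = {0, 0, 1};        // source node of each edge
  std::vector<int> col = {1, 1, 1};        // destination node of each edge
  std::vector<float> ufeat = {2.f, 5.f};   // one scalar feature per source node
  std::vector<float> out(2, 0.f);
#pragma omp parallel for
  for (int i = 0; i < static_cast<int>(row.size()); ++i) {
    const float val = ufeat[row[i]];       // Op::Call for copy_lhs
    if (val != 0) {                        // skip writing zeros, as in SpMMSumCoo
#pragma omp atomic
      out[col[i]] += val;                  // edges sharing col[i] may race, so atomic
    }
  }
  std::printf("out = [%g %g]\n", out[0], out[1]);  // [0 9]: 2 + 2 + 5 into node 1
}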
@@ -120,34 +143,31 @@
 * \param ufeat The feature on source nodes.
 * \param efeat The feature on edges.
 * \param out The result feature on destination nodes.
 * \param argu Arg-Min/Max on source nodes, which refers to the source node
 *        indices corresponding to the minimum/maximum values of the reduction
 *        result on destination nodes. It's useful in computing gradients of
 *        the Min/Max reducer.
 * \param arge Arg-Min/Max on edges, which refers to the edge indices
 *        corresponding to the minimum/maximum values of the reduction result
 *        on destination nodes. It's useful in computing gradients of the
 *        Min/Max reducer.
 * \note It uses node parallel strategy, different threads are responsible
 *       for the computation of different nodes.
 * \note The result will contain infinity for zero-degree nodes.
 */
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
                NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
  const bool has_idx = !IsNullArray(csr.data);
  const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
  const IdType* indices = static_cast<IdType*>(csr.indices->data);
  const IdType* edges =
      has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
  const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
  const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
  const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
                rhs_dim = bcast.rhs_len;
  DType* O = static_cast<DType*>(out->data);
  IdType* argX = Op::use_lhs ? static_cast<IdType*>(argu->data) : nullptr;
  IdType* argW = Op::use_rhs ? static_cast<IdType*>(arge->data) : nullptr;
#pragma omp parallel for
  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
    const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
@@ -155,25 +175,23 @@ void SpMMCmpCsr(
    IdType* argx_off = argX + rid * dim;
    IdType* argw_off = argW + rid * dim;
    std::fill(out_off, out_off + dim, Cmp::zero);
    if (Op::use_lhs) std::fill(argx_off, argx_off + dim, 0);
    if (Op::use_rhs) std::fill(argw_off, argw_off + dim, 0);
    for (IdType j = row_start; j < row_end; ++j) {
      const IdType cid = indices[j];
      const IdType eid = has_idx ? edges[j] : j;
      for (int64_t k = 0; k < dim; ++k) {
        const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
        const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
        const DType* lhs_off =
            Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
        const DType* rhs_off =
            Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
        const DType val = Op::Call(lhs_off, rhs_off);
        if (Cmp::Call(out_off[k], val)) {
          out_off[k] = val;
          if (Op::use_lhs) argx_off[k] = cid;
          if (Op::use_rhs) argw_off[k] = eid;
        }
      }
    }
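The Cmp reduction keeps, for every output element, the best value seen so far together with the indices that produced it; a scalar sketch (plain C++, not DGL's API) of one CSR row with the Max comparator and the copy_lhs operator:

#include <cstdio>
#include <limits>
#include <vector>

int main() {
  std::vector<int> indptr = {0, 3};               // one destination row, 3 in-edges
  std::vector<int> indices = {4, 7, 2};           // source node ids
  std::vector<float> vals = {0.5f, 2.0f, 1.5f};   // value gathered per in-edge
  float out = -std::numeric_limits<float>::infinity();  // Cmp::zero for Max
  int argu = 0;                                   // arg-max: winning source id
  for (int j = indptr[0]; j < indptr[1]; ++j) {
    if (out < vals[j]) {                          // Cmp::Call(accum, val) for Max
      out = vals[j];
      argu = indices[j];                          // remember which source won
    }
  }
  std::printf("max = %g from source %d\n", out, argu);  // max = 2 from source 7
}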
@@ -187,35 +205,32 @@
 * \param ufeat The feature on source nodes.
 * \param efeat The feature on edges.
 * \param out The result feature on destination nodes.
 * \param argu Arg-Min/Max on source nodes, which refers to the source node
 *        indices corresponding to the minimum/maximum values of the reduction
 *        result on destination nodes. It's useful in computing gradients of
 *        the Min/Max reducer.
 * \param arge Arg-Min/Max on edges, which refers to the edge indices
 *        corresponding to the minimum/maximum values of the reduction result
 *        on destination nodes. It's useful in computing gradients of the
 *        Min/Max reducer.
 * \note It uses node parallel strategy, different threads are responsible
 *       for the computation of different nodes. To avoid possible data hazard,
 *       we use atomic operators in the reduction phase.
 * \note The result will contain infinity for zero-degree nodes.
 */
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
                NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
  const bool has_idx = !IsNullArray(coo.data);
  const IdType* row = static_cast<IdType*>(coo.row->data);
  const IdType* col = static_cast<IdType*>(coo.col->data);
  const IdType* edges =
      has_idx ? static_cast<IdType*>(coo.data->data) : nullptr;
  const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
  const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
  const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
                rhs_dim = bcast.rhs_len;
  DType* O = static_cast<DType*>(out->data);
  IdType* argX = Op::use_lhs ? static_cast<IdType*>(argu->data) : nullptr;
  IdType* argW = Op::use_rhs ? static_cast<IdType*>(arge->data) : nullptr;
  const int64_t nnz = coo.row->shape[0];
  // fill zero elements
  std::fill(O, O + out.NumElements(), Cmp::zero);
@@ -224,145 +239,28 @@ void SpMMCmpCoo(
  for (IdType i = 0; i < nnz; ++i) {
    const IdType rid = row[i];
    const IdType cid = col[i];
    const IdType eid = has_idx ? edges[i] : i;
    DType* out_off = O + cid * dim;
    IdType* argx_off = Op::use_lhs ? argX + cid * dim : nullptr;
    IdType* argw_off = Op::use_rhs ? argW + cid * dim : nullptr;
    for (int64_t k = 0; k < dim; ++k) {
      const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
      const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
      const DType* lhs_off =
          Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
      const DType* rhs_off =
          Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
      const DType val = Op::Call(lhs_off, rhs_off);
#pragma omp critical
      if (Cmp::Call(out_off[k], val)) {
        out_off[k] = val;
        if (Op::use_lhs) argx_off[k] = rid;
        if (Op::use_rhs) argw_off[k] = eid;
      }
    }
  }
}
namespace op {
//////////////////////////////// binary operators on CPU ////////////////////////////////
template <typename DType>
struct Add {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off + *rhs_off;
}
};
template <typename DType> constexpr bool Add<DType>::use_lhs;
template <typename DType> constexpr bool Add<DType>::use_rhs;
template <typename DType>
struct Sub {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off - *rhs_off;
}
};
template <typename DType> constexpr bool Sub<DType>::use_lhs;
template <typename DType> constexpr bool Sub<DType>::use_rhs;
template <typename DType>
struct Mul {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off * *rhs_off;
}
};
template <typename DType> constexpr bool Mul<DType>::use_lhs;
template <typename DType> constexpr bool Mul<DType>::use_rhs;
template <typename DType>
struct Div {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off / *rhs_off;
}
};
template <typename DType> constexpr bool Div<DType>::use_lhs;
template <typename DType> constexpr bool Div<DType>::use_rhs;
template <typename DType>
struct CopyLhs {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = false;
inline static DType Call(const DType* lhs_off, const DType* ) {
return *lhs_off;
}
};
template <typename DType> constexpr bool CopyLhs<DType>::use_lhs;
template <typename DType> constexpr bool CopyLhs<DType>::use_rhs;
template <typename DType>
struct CopyRhs {
static constexpr bool use_lhs = false;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* , const DType* rhs_off) {
return *rhs_off;
}
};
template <typename DType> constexpr bool CopyRhs<DType>::use_lhs;
template <typename DType> constexpr bool CopyRhs<DType>::use_rhs;
//////////////////////////////// Reduce operators on CPU ////////////////////////////////
template <typename DType>
struct Max {
static constexpr DType zero = -std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) {
return accum < val;
}
};
template <typename DType> constexpr DType Max<DType>::zero;
template <typename DType>
struct Min {
static constexpr DType zero = std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) {
return accum > val;
}
};
template <typename DType> constexpr DType Min<DType>::zero;
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SpMM binary operator: " << op; \
} \
} while (0)
} // namespace op
}  // namespace cpu
}  // namespace aten
}  // namespace dgl
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/spmm_binary_ops.h
* \brief SPMM CPU Binary ops.
*/
#ifndef DGL_ARRAY_CPU_SPMM_BINARY_OPS_H_
#define DGL_ARRAY_CPU_SPMM_BINARY_OPS_H_
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <limits>
namespace dgl {
namespace aten {
namespace cpu {
namespace op {
//////////////////////////////// binary operators on CPU
///////////////////////////////////
template <typename DType>
struct Add {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off + *rhs_off;
}
};
template <typename DType>
constexpr bool Add<DType>::use_lhs;
template <typename DType>
constexpr bool Add<DType>::use_rhs;
template <typename DType>
struct Sub {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off - *rhs_off;
}
};
template <typename DType>
constexpr bool Sub<DType>::use_lhs;
template <typename DType>
constexpr bool Sub<DType>::use_rhs;
template <typename DType>
struct Mul {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off * *rhs_off;
}
};
template <typename DType>
constexpr bool Mul<DType>::use_lhs;
template <typename DType>
constexpr bool Mul<DType>::use_rhs;
template <typename DType>
struct Div {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off / *rhs_off;
}
};
template <typename DType>
constexpr bool Div<DType>::use_lhs;
template <typename DType>
constexpr bool Div<DType>::use_rhs;
template <typename DType>
struct CopyLhs {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = false;
inline static DType Call(const DType* lhs_off, const DType*) {
return *lhs_off;
}
};
template <typename DType>
constexpr bool CopyLhs<DType>::use_lhs;
template <typename DType>
constexpr bool CopyLhs<DType>::use_rhs;
template <typename DType>
struct CopyRhs {
typedef DType type;
static constexpr bool use_lhs = false;
static constexpr bool use_rhs = true;
inline static DType Call(const DType*, const DType* rhs_off) {
return *rhs_off;
}
};
template <typename DType>
constexpr bool CopyRhs<DType>::use_lhs;
template <typename DType>
constexpr bool CopyRhs<DType>::use_rhs;
//////////////////////////////// Reduce operators on CPU
///////////////////////////////////
template <typename DType>
struct Max {
typedef DType type;
static constexpr DType zero = -std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) { return accum < val; }
};
template <typename DType>
constexpr DType Max<DType>::zero;
template <typename DType>
struct Min {
typedef DType type;
static constexpr DType zero = std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) { return accum > val; }
};
template <typename DType>
constexpr DType Min<DType>::zero;
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SpMM binary operator: " << op; \
} \
} while (0)
} // namespace op
} // namespace cpu
} // namespace aten
} // namespace dgl
#endif // DGL_ARRAY_CPU_SPMM_BINARY_OPS_H_
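A minimal compile-and-run check (the include path is an assumption) of the operator contracts these kernels rely on: binary Ops expose use_lhs/use_rhs and Call over pointers, while the reduce Cmps expose an identity value zero and a Call(accum, val) predicate that says whether accum should be replaced:

#include <cassert>
#include <limits>
#include "array/cpu/spmm_binary_ops.h"  // path assumed

int main() {
  namespace op = dgl::aten::cpu::op;
  const double a = 3.0, b = 4.0;
  assert(op::Add<double>::Call(&a, &b) == 7.0);
  assert(op::Mul<double>::Call(&a, &b) == 12.0);
  assert(op::CopyRhs<double>::Call(nullptr, &b) == 4.0);  // use_lhs is false
  assert(op::Max<double>::Call(1.0, 2.0));                // 2.0 should replace 1.0
  assert(op::Min<double>::zero == std::numeric_limits<double>::infinity());
  return 0;
}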
#if !defined(_WIN32)
#include <../../src/array/cpu/spmm.h>
#include <dgl/array.h>
#include <gtest/gtest.h>
#include <time.h>
#include <random>
#include "./common.h"
using namespace dgl;
using namespace dgl::runtime;
int sizes[] = {1, 7, 8, 9, 31, 32, 33, 54, 63, 64, 65, 256, 257};
namespace ns_op = dgl::aten::cpu::op;
namespace {
template <class T>
void GenerateData(T* data, int dim, T mul) {
for (int i = 0; i < dim; i++) {
data[i] = (i + 1) * mul;
}
}
template <class T>
void GenerateRandomData(T* data, int dim) {
std::mt19937 rng(std::random_device{}());
std::uniform_int_distribution<> dist(0, 10000);
for (int i = 0; i < dim; i++) {
data[i] = (dist(rng) / 100);
}
}
template <class T>
void GenerateZeroData(T* data, int dim) {
for (int i = 0; i < dim; i++) {
data[i] = 0;
}
}
template <class T>
void Copy(T* exp, T* out, T* hs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = out[i] + hs[i];
}
}
template <class T>
void Add(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = out[i] + lhs[i] + rhs[i];
}
}
template <class T>
void Sub(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = out[i] + lhs[i] - rhs[i];
}
}
template <class T>
void Mul(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = (out[i] + (lhs[i] * rhs[i]));
}
}
template <class T>
void Div(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = (out[i] + (lhs[i] / rhs[i]));
}
}
template <class T>
void CheckResult(T* exp, T* out, T* out_intel_kernel, int dim) {
for (int i = 0; i < dim; i++) {
ASSERT_TRUE(exp[i] == out[i]);
if (out_intel_kernel != nullptr) {
ASSERT_TRUE(out[i] == out_intel_kernel[i]);
}
}
}
} // namespace
template <class ElemWiseUpd>
ElemWiseUpd* generic_ElemWiseUpd() {
static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
return cpu_spec;
}
template <typename IDX>
void _TestSpmmCopyLhs() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
// Calculation of expected output - 'exp'
Copy(exp, out, lhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::CopyLhs<IDX>::Call(lhs + k, nullptr);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyLhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, nullptr, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmCopyLhs) {
_TestSpmmCopyLhs<float>();
_TestSpmmCopyLhs<double>();
}
template <typename IDX>
void _TestSpmmCopyRhs() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Copy(exp, out, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::CopyRhs<IDX>::Call(nullptr, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyRhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, nullptr, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmCopyRhs) {
_TestSpmmCopyRhs<float>();
_TestSpmmCopyRhs<double>();
}
template <typename IDX>
void _TestSpmmAdd() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Add(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Add<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Add<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmAdd) {
_TestSpmmAdd<float>();
_TestSpmmAdd<double>();
}
template <typename IDX>
void _TestSpmmSub() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Sub(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Sub<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Sub<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmSub) {
_TestSpmmSub<float>();
_TestSpmmSub<double>();
}
template <typename IDX>
void _TestSpmmMul() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Mul(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Mul<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Mul<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmMul) {
_TestSpmmMul<float>();
_TestSpmmMul<double>();
}
template <typename IDX>
void _TestSpmmDiv() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateData(lhs, dim, (IDX)15);
GenerateData(rhs, dim, (IDX)1);
// Calculation of expected output - 'exp'
Div(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Div<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Div<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmDiv) {
_TestSpmmDiv<float>();
_TestSpmmDiv<double>();
}
#endif
Subproject commit 0140eeff1fffcf5069dea3abb57095695320971c