Commit da2c1226 authored by lucasb-eyer

Initial import of code.

parent 3ae01ffb
FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_arch_NEON_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_NEON_H
#define EIGEN_COMPLEX_NEON_H
namespace Eigen {
namespace internal {
static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
//---------- float ----------
struct Packet2cf
{
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
Packet4f v;
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
float32x2_t r64;
r64 = vld1_f32((float *)&from);
return Packet2cf(vcombine_f32(r64, r64));
}
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
{
Packet4ui b = vreinterpretq_u32_f32(a.v);
return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
}
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
Packet4f v1, v2;
float32x2_t a_lo, a_hi;
// Get the real values of a | a1_re | a1_re | a2_re | a2_re |
v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
// Get the imaginary values of a | a1_im | a1_im | a2_im | a2_im |
v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
// Multiply the real parts of a with b
v1 = vmulq_f32(v1, b.v);
// Multiply the imaginary parts of a with b
v2 = vmulq_f32(v2, b.v);
// Conjugate v2
v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
// Swap real/imag elements in v2.
a_lo = vrev64_f32(vget_low_f32(v2));
a_hi = vrev64_f32(vget_high_f32(v2));
v2 = vcombine_f32(a_lo, a_hi);
// Add and return the result
return Packet2cf(vaddq_f32(v1, v2));
}
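// The vectorized product above follows the textbook complex product, two values at a time:
//   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
// A minimal scalar sketch for comparison; the helper name is hypothetical and not used elsewhere:
inline std::complex<float> pmul_cf_scalar_sketch(const std::complex<float>& a, const std::complex<float>& b)
{
  return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(),
                             a.real()*b.imag() + a.imag()*b.real());
}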
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { __pld((float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
std::complex<float> EIGEN_ALIGN16 x[2];
vst1q_f32((float *)x, a.v);
return x[0];
}
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
float32x2_t a_lo, a_hi;
Packet4f a_r128;
a_lo = vget_low_f32(a.v);
a_hi = vget_high_f32(a.v);
a_r128 = vcombine_f32(a_hi, a_lo);
return Packet2cf(a_r128);
}
template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
{
return Packet2cf(vrev64q_f32(a.v));
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
float32x2_t a1, a2;
std::complex<float> s;
a1 = vget_low_f32(a.v);
a2 = vget_high_f32(a.v);
a2 = vadd_f32(a1, a2);
vst1_f32((float *)&s, a2);
return s;
}
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
Packet4f sum1, sum2, sum;
// Combine the low halves of vecs[0] and vecs[1], then their high halves, and add the two
sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
sum = vaddq_f32(sum1, sum2);
return Packet2cf(sum);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
{
float32x2_t a1, a2, v1, v2, prod;
std::complex<float> s;
a1 = vget_low_f32(a.v);
a2 = vget_high_f32(a.v);
// Get the real part of the first complex number | a1_re | a1_re |
v1 = vdup_lane_f32(a1, 0);
// Get the imaginary part of the first complex number | a1_im | a1_im |
v2 = vdup_lane_f32(a1, 1);
// Multiply the real part of a1 with a2
v1 = vmul_f32(v1, a2);
// Multiply the imaginary part of a1 with a2
v2 = vmul_f32(v2, a2);
// Conjugate v2
v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
// Swap real/imag elements in v2.
v2 = vrev64_f32(v2);
// Add v1, v2
prod = vadd_f32(v1, v2);
vst1_f32((float *)&s, prod);
return s;
}
template<int Offset>
struct palign_impl<Offset,Packet2cf>
{
EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
{
if (Offset==1)
{
first.v = vextq_f32(first.v, second.v, 2);
}
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for NEON
Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
Packet4f s, rev_s;
float32x2_t a_lo, a_hi;
// this computes the norm
s = vmulq_f32(b.v, b.v);
a_lo = vrev64_f32(vget_low_f32(s));
a_hi = vrev64_f32(vget_high_f32(s));
rev_s = vcombine_f32(a_lo, a_hi);
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
}
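// The division above relies on the identity a/b = (a * conj(b)) / |b|^2: res holds a*conj(b),
// s holds the squared components of b, and s + rev_s broadcasts |b|^2 = b_re^2 + b_im^2 to both
// lanes of each complex number. A minimal scalar sketch; the helper name is hypothetical:
inline std::complex<float> pdiv_cf_scalar_sketch(const std::complex<float>& a, const std::complex<float>& b)
{
  const float norm = b.real()*b.real() + b.imag()*b.imag(); // |b|^2
  const std::complex<float> num = a * std::conj(b);         // a * conj(b)
  return std::complex<float>(num.real()/norm, num.imag()/norm);
}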
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_NEON_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
// Heavily based on Gael's SSE version.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_NEON_H
#define EIGEN_PACKET_MATH_NEON_H
namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#endif
typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
typedef uint32x4_t Packet4ui;
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f p4f_##NAME = vreinterpretq_f32_s32(pset1<Packet4i>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
#if defined(__llvm__) && !defined(__clang__)
//Special treatment for Apple's llvm-gcc, its NEON packet types are unions
#define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}}
#define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
#else
//Default initializer for packets
#define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y}
#define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
#endif
#ifndef __pld
#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
#endif
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
HasDiv = 1,
// FIXME check the Has*
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4
// FIXME check the Has*
};
};
#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
// workaround gcc 4.2, 4.3 and 4.4 compilation issues
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
{
Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
return vaddq_f32(pset1<Packet4f>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
{
Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
return vaddq_s32(pset1<Packet4i>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f inv, restep, div;
// NEON does not offer a divide instruction; we have to do a reciprocal approximation.
// However, in contrast to other SIMD engines (AltiVec/SSE), NEON offers
// a reciprocal estimate AND a reciprocal step, which saves a few instructions.
// vrecpeq_f32() returns an estimate of 1/b, which we then refine with
// Newton-Raphson and vrecpsq_f32()
inv = vrecpeq_f32(b);
// vrecpsq_f32() returns a correction factor by which we multiply inv to get a better
// approximation of 1/b.
restep = vrecpsq_f32(b, inv);
inv = vmulq_f32(restep, inv);
// Finally, multiply a by 1/b to get the desired result of the division.
div = vmulq_f32(a, inv);
return div;
}
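// One Newton-Raphson step for the reciprocal refines an estimate r0 of 1/b as r1 = r0*(2 - b*r0);
// vrecpsq_f32(b, r0) returns exactly the factor (2 - b*r0). A minimal scalar sketch of that step;
// the helper name is hypothetical:
inline float reciprocal_nr_step_sketch(float b, float r0)
{
  return r0 * (2.0f - b * r0); // one refinement of the estimate r0 of 1/b
}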
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division are not supported by NEON");
return pset1<Packet4i>(0);
}
// for some weird reason, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
// Logical operations are not supported for float, so we have to reinterpret-cast to integer vectors using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
float32x2_t lo, hi;
lo = vdup_n_f32(*from);
hi = vdup_n_f32(*(from+1));
return vcombine_f32(lo, hi);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
int32x2_t lo, hi;
lo = vdup_n_s32(*from);
hi = vdup_n_s32(*(from+1));
return vcombine_s32(lo, hi);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { __pld(addr); }
// FIXME only store the first 2 elements?
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
Packet4f a_r64;
a_r64 = vrev64q_f32(a);
a_lo = vget_low_f32(a_r64);
a_hi = vget_high_f32(a_r64);
return vcombine_f32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
int32x2_t a_lo, a_hi;
Packet4i a_r64;
a_r64 = vrev64q_s32(a);
a_lo = vget_low_s32(a_r64);
a_hi = vget_high_s32(a_r64);
return vcombine_s32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, sum;
float s[2];
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
sum = vpadd_f32(a_lo, a_hi);
sum = vpadd_f32(sum, sum);
vst1_f32(s, sum);
return s[0];
}
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
float32x4x2_t vtrn1, vtrn2, res1, res2;
Packet4f sum1, sum2, sum;
// NEON zip performs interleaving of the supplied vectors.
// We perform two interleaves in a row to acquire the transposed vector
vtrn1 = vzipq_f32(vecs[0], vecs[2]);
vtrn2 = vzipq_f32(vecs[1], vecs[3]);
res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
// Do the addition of the resulting vectors
sum1 = vaddq_f32(res1.val[0], res1.val[1]);
sum2 = vaddq_f32(res2.val[0], res2.val[1]);
sum = vaddq_f32(sum1, sum2);
return sum;
}
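// preduxp returns, in lane k, the horizontal sum of vecs[k]; the two zip passes above transpose
// the 4x4 block so that three vertical adds produce all four sums at once. A minimal scalar
// sketch of the intended result; the helper name is hypothetical:
inline void preduxp4_scalar_sketch(const float in[4][4], float out[4])
{
  for (int k = 0; k < 4; ++k)
    out[k] = in[k][0] + in[k][1] + in[k][2] + in[k][3]; // horizontal sum of row k
}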
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, sum;
int32_t s[2];
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
sum = vpadd_s32(a_lo, a_hi);
sum = vpadd_s32(sum, sum);
vst1_s32(s, sum);
return s[0];
}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
int32x4x2_t vtrn1, vtrn2, res1, res2;
Packet4i sum1, sum2, sum;
// NEON zip performs interleaving of the supplied vectors.
// We perform two interleaves in a row to acquire the transposed vector
vtrn1 = vzipq_s32(vecs[0], vecs[2]);
vtrn2 = vzipq_s32(vecs[1], vecs[3]);
res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
// Do the addition of the resulting vectors
sum1 = vaddq_s32(res1.val[0], res1.val[1]);
sum2 = vaddq_s32(res2.val[0], res2.val[1]);
sum = vaddq_s32(sum1, sum2);
return sum;
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, prod;
float s[2];
// Get a_lo = |a1|a2| and a_hi = |a3|a4|
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
// Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
prod = vmul_f32(a_lo, a_hi);
// Multiply prod with its swapped value |a2*a4|a1*a3|
prod = vmul_f32(prod, vrev64_f32(prod));
vst1_f32(s, prod);
return s[0];
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, prod;
int32_t s[2];
// Get a_lo = |a1|a2| and a_hi = |a3|a4|
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
// Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
prod = vmul_s32(a_lo, a_hi);
// Multiply prod with its swapped value |a2*a4|a1*a3|
prod = vmul_s32(prod, vrev64_s32(prod));
vst1_s32(s, prod);
return s[0];
}
// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, min;
float s[2];
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
min = vpmin_f32(a_lo, a_hi);
min = vpmin_f32(min, min);
vst1_f32(s, min);
return s[0];
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, min;
int32_t s[2];
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
min = vpmin_s32(a_lo, a_hi);
min = vpmin_s32(min, min);
vst1_s32(s, min);
return s[0];
}
// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, max;
float s[2];
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
max = vpmax_f32(a_lo, a_hi);
max = vpmax_f32(max, max);
vst1_f32(s, max);
return s[0];
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, max;
int32_t s[2];
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
max = vpmax_s32(a_lo, a_hi);
max = vpmax_s32(max, max);
vst1_s32(s, max);
return s[0];
}
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing spurious compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
{\
if (Offset!=0)\
first = Command(first, second, Offset);\
}\
};\
PALIGN_NEON(0,Packet4f,vextq_f32)
PALIGN_NEON(1,Packet4f,vextq_f32)
PALIGN_NEON(2,Packet4f,vextq_f32)
PALIGN_NEON(3,Packet4f,vextq_f32)
PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)
#undef PALIGN_NEON
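// palign_impl<Offset>::run shifts a four-element window across the concatenation of two packets:
// for Offset==1, first = |a0|a1|a2|a3| and second = |b0|b1|b2|b3| yield |a1|a2|a3|b0|, which is
// what vextq_f32(first, second, 1) computes. A minimal scalar sketch; the helper name is hypothetical:
inline void palign_scalar_sketch(float first[4], const float second[4], int offset)
{
  float tmp[8];
  for (int i = 0; i < 4; ++i) { tmp[i] = first[i]; tmp[4+i] = second[i]; } // concatenate
  for (int i = 0; i < 4; ++i) first[i] = tmp[offset + i];                  // shifted window
}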
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_NEON_H
FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_arch_SSE_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_SSE_H
#define EIGEN_COMPLEX_SSE_H
namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet2cf
{
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
__m128 v;
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
{
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
return Packet2cf(_mm_xor_ps(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
{
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
return Packet2cf(_mm_xor_ps(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for SSE3 and 4
#ifdef EIGEN_VECTORIZE_SSE3
return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
_mm_mul_ps(_mm_movehdup_ps(a.v),
vec4f_swizzle1(b.v, 1, 0, 3, 2))));
// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
// _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
// vec4f_swizzle1(b.v, 1, 0, 3, 2))));
#else
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
#if EIGEN_GNUC_AT_MOST(4,2)
// workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
#else
res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
#endif
return Packet2cf(_mm_movelh_ps(res.v,res.v));
}
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
#if EIGEN_GNUC_AT_MOST(4,3)
// Workaround gcc 4.2 ICE - this is not ideal performance-wise, but who cares...
// This workaround also fixes invalid code generation with gcc 4.3
EIGEN_ALIGN16 std::complex<float> res[2];
_mm_store_ps((float*)res, a.v);
return res[0];
#else
std::complex<float> res;
_mm_storel_pi((__m64*)&res, a.v);
return res;
#endif
}
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
{
return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
}
template<int Offset>
struct palign_impl<Offset,Packet2cf>
{
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
{
if (Offset==1)
{
first.v = _mm_movehl_ps(first.v, first.v);
first.v = _mm_movelh_ps(first.v, second.v);
}
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
#ifdef EIGEN_VECTORIZE_SSE3
return internal::pmul(a, pconj(b));
#else
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
vec4f_swizzle1(b.v, 1, 0, 3, 2))));
#endif
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
#ifdef EIGEN_VECTORIZE_SSE3
return internal::pmul(pconj(a), b);
#else
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
#endif
}
};
template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
#ifdef EIGEN_VECTORIZE_SSE3
return pconj(internal::pmul(a, b));
#else
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
vec4f_swizzle1(b.v, 1, 0, 3, 2))));
#endif
}
};
template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for SSE3 and 4
Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
__m128 s = _mm_mul_ps(b.v,b.v);
return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
}
EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
{
return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
}
//---------- double ----------
struct Packet1cd
{
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
__m128d v;
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet1cd type;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
{
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
return Packet1cd(_mm_xor_pd(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for SSE3 and 4
#ifdef EIGEN_VECTORIZE_SSE3
return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
vec2d_swizzle1(b.v, 1, 0))));
#else
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
vec2d_swizzle1(b.v, 1, 0)), mask)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
// FIXME force unaligned load, this is a temporary fix
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
// FIXME force unaligned store, this is a temporary fix
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, a.v);
return std::complex<double>(res[0],res[1]);
}
template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
{
return pfirst(a);
}
template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
{
return vecs[0];
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
{
return pfirst(a);
}
template<int Offset>
struct palign_impl<Offset,Packet1cd>
{
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
{
// FIXME is it certain that we never have to align a Packet1cd?
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16-byte boundary...
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
#ifdef EIGEN_VECTORIZE_SSE3
return internal::pmul(a, pconj(b));
#else
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
vec2d_swizzle1(b.v, 1, 0))));
#endif
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
#ifdef EIGEN_VECTORIZE_SSE3
return internal::pmul(pconj(a), b);
#else
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
vec2d_swizzle1(b.v, 1, 0)), mask)));
#endif
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
#ifdef EIGEN_VECTORIZE_SSE3
return pconj(internal::pmul(a, b));
#else
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
vec2d_swizzle1(b.v, 1, 0))));
#endif
}
};
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for SSE3 and 4
Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
__m128d s = _mm_mul_pd(b.v,b.v);
return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
}
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
return Packet1cd(preverse(x.v));
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_SSE_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_MATH_FUNCTIONS_SSE_H
namespace Eigen {
namespace internal {
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
Packet4f x = _x;
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
/* the smallest non denormalized float number */
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f);
/* natural logarithm computed for 4 simultaneous floats;
returns NaN for x <= 0
*/
_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
Packet4i emm0;
Packet4f invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
/* keep only the fractional part */
x = _mm_and_ps(x, p4f_inv_mant_mask);
x = _mm_or_ps(x, p4f_half);
emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
Packet4f tmp = _mm_and_ps(x, mask);
x = psub(x, p4f_1);
e = psub(e, _mm_and_ps(p4f_1, mask));
x = padd(x, tmp);
Packet4f x2 = pmul(x,x);
Packet4f x3 = pmul(x2,x);
Packet4f y, y1, y2;
y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
y = pmadd(y , x, p4f_cephes_log_p2);
y1 = pmadd(y1, x, p4f_cephes_log_p5);
y2 = pmadd(y2, x, p4f_cephes_log_p8);
y = pmadd(y, x3, y1);
y = pmadd(y, x3, y2);
y = pmul(y, x3);
y1 = pmul(e, p4f_cephes_log_q1);
tmp = pmul(x2, p4f_half);
y = padd(y, y1);
x = psub(x, tmp);
y2 = pmul(e, p4f_cephes_log_q2);
x = padd(x, y);
x = padd(x, y2);
// negative arg will be NAN, 0 will be -INF
return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
_mm_and_ps(iszero_mask, p4f_minus_inf));
}
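// plog follows the classic cephes scheme: split x = m * 2^e with m in [0.5, 1), so that
// log(x) = e*log(2) + log(m), then approximate log(m) by a degree-8 polynomial in (m - 1).
// A minimal scalar sketch of the decomposition (the polynomial is replaced by std::log and
// <cmath> is assumed to be available, as elsewhere in Eigen); the helper name is hypothetical:
inline float plog_scalar_sketch(float x)
{
  int e;
  float m = std::frexp(x, &e);        // x = m * 2^e, with m in [0.5, 1)
  if (m < 0.707106781186547524f)      // same branch as the SQRTHF test above
  { m = m + m; --e; }                 // keep the polynomial argument close to 1
  return std::log(m) + float(e) * 0.693147180559945f; // log(m) + e*log(2)
}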
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
Packet4f x = _x;
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
_EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
_EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
Packet4f tmp = _mm_setzero_ps(), fx;
Packet4i emm0;
// clamp x
x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
/* how to perform a floorf with SSE: just below */
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
/* if greater, subtract 1 */
Packet4f mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, p4f_1);
fx = psub(tmp, mask);
tmp = pmul(fx, p4f_cephes_exp_C1);
Packet4f z = pmul(fx, p4f_cephes_exp_C2);
x = psub(x, tmp);
x = psub(x, z);
z = pmul(x,x);
Packet4f y = p4f_cephes_exp_p0;
y = pmadd(y, x, p4f_cephes_exp_p1);
y = pmadd(y, x, p4f_cephes_exp_p2);
y = pmadd(y, x, p4f_cephes_exp_p3);
y = pmadd(y, x, p4f_cephes_exp_p4);
y = pmadd(y, x, p4f_cephes_exp_p5);
y = pmadd(y, z, x);
y = padd(y, p4f_1);
// build 2^n
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
return pmul(y, _mm_castsi128_ps(emm0));
}
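// pexp uses the standard range reduction exp(x) = 2^n * exp(g), with n = round(x/log(2)) and
// g = x - n*log(2) (the C1/C2 constants above are log(2) split in two parts for extra precision);
// 2^n is then rebuilt by writing n+127 into the float exponent field. A minimal scalar sketch
// (the polynomial is replaced by std::exp, <cmath> assumed); the helper name is hypothetical:
inline float pexp_scalar_sketch(float x)
{
  const float n = std::floor(x * 1.44269504088896341f + 0.5f);  // round(x / log(2))
  const float g = x - n * 0.693359375f - n * (-2.12194440e-4f); // x - n*log(2), split form
  return std::ldexp(std::exp(g), int(n));                       // exp(g) * 2^n
}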
/* evaluation of 4 sines at once, using SSE2 intrinsics.
The code is an exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it returns sinf((float)M_PI) = 8.74e-8, which is a
surprising but correct result.
*/
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psin<Packet4f>(const Packet4f& _x)
{
Packet4f x = _x;
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet4i(1, 1);
_EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
_EIGEN_DECLARE_CONST_Packet4i(2, 2);
_EIGEN_DECLARE_CONST_Packet4i(4, 4);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
_EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
_EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
_EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
_EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
_EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
_EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
Packet4i emm0, emm2;
sign_bit = x;
/* take the absolute value */
x = pabs(x);
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
/* scale by 4/Pi */
y = pmul(x, p4f_cephes_FOPI);
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, p4i_1);
emm2 = _mm_and_si128(emm2, p4i_not1);
y = _mm_cvtepi32_ps(emm2);
/* get the swap sign flag */
emm0 = _mm_and_si128(emm2, p4i_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynomial selection mask:
there is one polynomial for 0 <= x <= Pi/4
and another one for Pi/4 < x <= Pi/2.
Both branches will be computed.
*/
emm2 = _mm_and_si128(emm2, p4i_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
Packet4f poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = pmul(y, p4f_minus_cephes_DP1);
xmm2 = pmul(y, p4f_minus_cephes_DP2);
xmm3 = pmul(y, p4f_minus_cephes_DP3);
x = padd(x, xmm1);
x = padd(x, xmm2);
x = padd(x, xmm3);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
y = p4f_coscof_p0;
Packet4f z = _mm_mul_ps(x,x);
y = pmadd(y, z, p4f_coscof_p1);
y = pmadd(y, z, p4f_coscof_p2);
y = pmul(y, z);
y = pmul(y, z);
Packet4f tmp = pmul(z, p4f_half);
y = psub(y, tmp);
y = padd(y, p4f_1);
/* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
Packet4f y2 = p4f_sincof_p0;
y2 = pmadd(y2, z, p4f_sincof_p1);
y2 = pmadd(y2, z, p4f_sincof_p2);
y2 = pmul(y2, z);
y2 = pmul(y2, x);
y2 = padd(y2, x);
/* select the correct result from the two polynomials */
y2 = _mm_and_ps(poly_mask, y2);
y = _mm_andnot_ps(poly_mask, y);
y = _mm_or_ps(y,y2);
/* update the sign */
return _mm_xor_ps(y, sign_bit);
}
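// psin reduces the argument to [0, Pi/4]: j = (int)(|x| * 4/Pi) is mapped to an even octant,
// x' = |x| - j*Pi/4 is computed with the three DP constants above, then either the sine or the
// cosine polynomial is selected depending on the octant, and the sign is restored at the end.
// A minimal scalar sketch (polynomials replaced by libm, <cmath> assumed); the name is hypothetical:
inline float psin_scalar_sketch(float x)
{
  const float sign = x < 0.0f ? -1.0f : 1.0f;
  x = std::fabs(x);
  int j = int(x * 1.27323954473516f);  // octant index, 4/Pi
  j = (j + 1) & ~1;                    // map to an even octant, as in the cephes code
  const float y = float(j);
  const float z = ((x - y*0.78515625f) - y*2.4187564849853515625e-4f) - y*3.77489497744594108e-8f;
  const float r = (j & 2) ? std::cos(z) : std::sin(z); // polynomial pair selection
  return ((j & 4) ? -r : r) * sign;                    // swap-sign flag and original sign
}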
/* almost the same as psin */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pcos<Packet4f>(const Packet4f& _x)
{
Packet4f x = _x;
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet4i(1, 1);
_EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
_EIGEN_DECLARE_CONST_Packet4i(2, 2);
_EIGEN_DECLARE_CONST_Packet4i(4, 4);
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
_EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
_EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
_EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
_EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
_EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
_EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
Packet4i emm0, emm2;
x = pabs(x);
/* scale by 4/Pi */
y = pmul(x, p4f_cephes_FOPI);
/* get the integer part of y */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, p4i_1);
emm2 = _mm_and_si128(emm2, p4i_not1);
y = _mm_cvtepi32_ps(emm2);
emm2 = _mm_sub_epi32(emm2, p4i_2);
/* get the swap sign flag */
emm0 = _mm_andnot_si128(emm2, p4i_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynomial selection mask */
emm2 = _mm_and_si128(emm2, p4i_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
Packet4f sign_bit = _mm_castsi128_ps(emm0);
Packet4f poly_mask = _mm_castsi128_ps(emm2);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = pmul(y, p4f_minus_cephes_DP1);
xmm2 = pmul(y, p4f_minus_cephes_DP2);
xmm3 = pmul(y, p4f_minus_cephes_DP3);
x = padd(x, xmm1);
x = padd(x, xmm2);
x = padd(x, xmm3);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
y = p4f_coscof_p0;
Packet4f z = pmul(x,x);
y = pmadd(y,z,p4f_coscof_p1);
y = pmadd(y,z,p4f_coscof_p2);
y = pmul(y, z);
y = pmul(y, z);
Packet4f tmp = _mm_mul_ps(z, p4f_half);
y = psub(y, tmp);
y = padd(y, p4f_1);
/* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
Packet4f y2 = p4f_sincof_p0;
y2 = pmadd(y2, z, p4f_sincof_p1);
y2 = pmadd(y2, z, p4f_sincof_p2);
y2 = pmul(y2, z);
y2 = pmadd(y2, x, x);
/* select the correct result from the two polynomials */
y2 = _mm_and_ps(poly_mask, y2);
y = _mm_andnot_ps(poly_mask, y);
y = _mm_or_ps(y,y2);
/* update the sign */
return _mm_xor_ps(y, sign_bit);
}
// This is based on Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& _x)
{
Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
/* select only the inverse sqrt of non-zero inputs */
Packet4f non_zero_mask = _mm_cmpgt_ps(_x, pset1<Packet4f>(std::numeric_limits<float>::epsilon()));
Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
return pmul(_x,x);
}
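// psqrt computes sqrt(x) as x * rsqrt(x): the hardware estimate r0 = _mm_rsqrt_ps(x) is refined
// with one Newton-Raphson step r1 = r0*(1.5 - 0.5*x*r0*r0), and the non-zero mask avoids turning
// rsqrt(0) = inf into a NaN. A minimal scalar sketch of that step; the helper name is hypothetical:
inline float psqrt_nr_sketch(float x, float r0 /* rough estimate of 1/sqrt(x) */)
{
  const float r1 = r0 * (1.5f - 0.5f * x * r0 * r0); // one Newton-Raphson refinement of rsqrt
  return x * r1;                                     // x * (1/sqrt(x)) = sqrt(x)
}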
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_SSE_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H
namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
typedef __m128 Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;
template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
#define vec4f_swizzle1(v,p,q,r,s) \
(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
#define vec4i_swizzle1(v,p,q,r,s) \
(_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
#define vec2d_swizzle1(v,p,q) \
(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
#define vec4f_swizzle2(a,b,p,q,r,s) \
(_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
#define vec4i_swizzle2(a,b,p,q,r,s) \
(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet2d type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=2,
HasDiv = 1
};
};
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
enum {
// FIXME check the Has*
Vectorizable = 1,
AlignedOnScalar = 1,
size=4
};
};
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
#if defined(_MSC_VER) && (_MSC_VER==1500)
// Workaround MSVC 9 internal compiler error.
// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
return _mm_xor_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
return _mm_xor_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
return psub(_mm_setr_epi32(0,0,0,0), a);
}
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_mullo_epi32(a,b);
#else
// this version is slightly faster than 4 scalar products
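// Sketch of the trick: _mm_mul_epu32 multiplies lanes 0 and 2 into two 64-bit results,
// so it is applied once to (a,b) and once to the pair-swapped operands to cover lanes 1 and 3;
// the low 32 bits of each 64-bit product (which match the signed 32-bit product) are then
// gathered and interleaved back into the original lane order.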
return vec4i_swizzle1(
vec4i_swizzle2(
_mm_mul_epu32(a,b),
_mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
vec4i_swizzle1(b,1,0,3,2)),
0,2,0,2),
0,2,1,3);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by SSE");
return pset1<Packet4i>(0);
}
// for some weird reason, pmadd has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
// after some bench, this version *is* faster than a scalar implementation
Packet4i mask = _mm_cmplt_epi32(a,b);
return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
}
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
// after some bench, this version *is* faster than a scalar implementation
Packet4i mask = _mm_cmpgt_epi32(a,b);
return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
}
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
#if defined(_MSC_VER)
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
#if (_MSC_VER==1600)
// NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
// (i.e., they do not emit an unaligned load!!)
// TODO On most architectures this version should also be faster than a single _mm_loadu_ps,
// so we could also enable it for MSVC08, but first we have to make sure the latter does not generate bad code when doing so...
__m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
res = _mm_loadh_pi(res, (const __m64*)(from+2));
return res;
#else
return _mm_loadu_ps(from);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
#else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require casting pointers to incompatible pointer types and would lead to invalid code
// because of the strict aliasing rule. The "dummy" stuff is required to enforce
// a correct instruction dependency.
// TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!
#if defined(__GNUC__) && defined(__i386__)
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#elif defined(__clang__)
// bug 201: Segfaults in _mm_loadh_pd with clang 2.8
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#else
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
#endif
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return _mm_loadu_ps(from);
#else
__m128d res;
res = _mm_load_sd((const double*)(from)) ;
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
return _mm_castpd_ps(res);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return _mm_loadu_pd(from);
#else
__m128d res;
res = _mm_load_sd(from) ;
res = _mm_loadh_pd(res,from+1);
return res;
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
#else
__m128d res;
res = _mm_load_sd((const double*)(from)) ;
res = _mm_loadh_pd(res, (const double*)(from+2)) ;
return _mm_castpd_si128(res);
#endif
}
#endif
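// ploaddup loads half a packet's worth of scalars and duplicates each of them in place,
// e.g., for Packet4f: {from[0], from[0], from[1], from[1]}.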
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ return pset1<Packet2d>(from[0]); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
Packet4i tmp;
tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
EIGEN_DEBUG_UNALIGNED_STORE
_mm_storel_pd((to), from);
_mm_storeh_pd((to+1), from);
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
Packet4f pa = _mm_set_ss(a);
pstore(to, vec4f_swizzle1(pa,0,0,0,0));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
Packet2d pa = _mm_set_sd(a);
pstore(to, vec2d_swizzle1(pa,0,0));
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct access to the struct members fixed bug #62.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#else
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
#endif
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{ return _mm_shuffle_ps(a,a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{ return _mm_shuffle_pd(a,a,0x1); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
return _mm_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
return _mm_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
return _mm_abs_epi32(a);
#else
Packet4i aux = _mm_srai_epi32(a,31);
return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}
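// punpackp broadcasts each of the four lanes of vecs[0] into its own packet:
// on output, vecs[k] = {vecs[0][k], vecs[0][k], vecs[0][k], vecs[0][k]} for k = 0..3.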
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{
vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}
#ifdef EIGEN_VECTORIZE_SSE3
// TODO implement SSE2 versions as well as integer versions
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
return _mm_hadd_pd(vecs[0], vecs[1]);
}
// SSSE3 version:
// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
// {
// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
// }
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f tmp0 = _mm_hadd_ps(a,a);
return pfirst(_mm_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
// SSSE3 version:
// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
// {
// Packet4i tmp0 = _mm_hadd_epi32(a,a);
// return pfirst(_mm_hadd_epi32(tmp0, tmp0));
// }
#else
// SSE2 versions
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
Packet4f tmp0, tmp1, tmp2;
tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
tmp0 = _mm_add_ps(tmp0, tmp1);
tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
tmp1 = _mm_add_ps(tmp1, tmp2);
tmp2 = _mm_movehl_ps(tmp1, tmp0);
tmp0 = _mm_movelh_ps(tmp0, tmp1);
return _mm_add_ps(tmp0, tmp2);
}
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}
#endif // SSE3
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
Packet4i tmp0, tmp1, tmp2;
tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
tmp0 = _mm_add_epi32(tmp0, tmp1);
tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
tmp1 = _mm_add_epi32(tmp1, tmp2);
tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
return _mm_add_epi32(tmp0, tmp2);
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore!!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
return aux0<aux2 ? aux0 : aux2;
}
// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::max after the pstore!!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
return aux0>aux2 ? aux0 : aux2;
}
#if (defined __GNUC__)
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
// Packet4f res = b;
// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
// return res;
// }
// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
// {
// Packet4i res = a;
// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
// return res;
// }
#endif
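// palign_impl<Offset,Packet>::run(first, second) shifts the concatenation [first, second]
// by Offset elements into 'first'; e.g., for Packet4f with Offset==1 the result is
// {f1, f2, f3, s0}. SSSE3 uses palignr directly, while the SSE2 fallback emulates it with shuffles.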
#ifdef EIGEN_VECTORIZE_SSSE3
// SSSE3 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
{
if (Offset!=0)
first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
}
};
template<int Offset>
struct palign_impl<Offset,Packet4i>
{
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
{
if (Offset!=0)
first = _mm_alignr_epi8(second,first, Offset*4);
}
};
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
{
if (Offset==1)
first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
}
};
#else
// SSE2 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
{
if (Offset==1)
{
first = _mm_move_ss(first,second);
first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
}
else if (Offset==2)
{
first = _mm_movehl_ps(first,first);
first = _mm_movelh_ps(first,second);
}
else if (Offset==3)
{
first = _mm_move_ss(first,second);
first = _mm_shuffle_ps(first,second,0x93);
}
}
};
template<int Offset>
struct palign_impl<Offset,Packet4i>
{
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
{
if (Offset==1)
{
first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
first = _mm_shuffle_epi32(first,0x39);
}
else if (Offset==2)
{
first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
}
else if (Offset==3)
{
first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
}
}
};
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
{
if (Offset==1)
{
first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
}
}
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_SSE_H
FILE(GLOB Eigen_Core_Product_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_Product_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel
)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COEFFBASED_PRODUCT_H
#define EIGEN_COEFFBASED_PRODUCT_H
namespace Eigen {
namespace internal {
/*********************************************************************************
* Coefficient based product implementation.
* It is designed for the following use cases:
* - small fixed sizes
* - lazy products
*********************************************************************************/
/* Since all the dimensions of the product are small, here we can rely
* on the generic Assign mechanism to evaluate the product per coeff (or packet).
*
* Note that here the inner-loops should always be unrolled.
*/
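/* In scalar form, the implementations below simply compute
*   res(row,col) = sum_{k=0..InnerSize-1} lhs(row,k) * rhs(k,col),
* either fully unrolled when UnrollingIndex is known at compile time,
* or with a runtime loop over k in the Dynamic case.
*/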
template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl;
template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl;
template<typename LhsNested, typename RhsNested, int NestingFlags>
struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
{
typedef MatrixXpr XprKind;
typedef typename remove_all<LhsNested>::type _LhsNested;
typedef typename remove_all<RhsNested>::type _RhsNested;
typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
typename traits<_RhsNested>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<_LhsNested>::Index,
typename traits<_RhsNested>::Index>::type Index;
enum {
LhsCoeffReadCost = _LhsNested::CoeffReadCost,
RhsCoeffReadCost = _RhsNested::CoeffReadCost,
LhsFlags = _LhsNested::Flags,
RhsFlags = _RhsNested::Flags,
RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
LhsRowMajor = LhsFlags & RowMajorBit,
RhsRowMajor = RhsFlags & RowMajorBit,
SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
&& (ColsAtCompileTime == Dynamic
|| ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
&& (RhsFlags&AlignedBit)
)
),
CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
&& (RowsAtCompileTime == Dynamic
|| ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
&& (LhsFlags&AlignedBit)
)
),
EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
: (RhsRowMajor && !CanVectorizeLhs),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
| (EvalToRowMajor ? RowMajorBit : 0)
| NestingFlags
| (LhsFlags & RhsFlags & AlignedBit)
// TODO enable vectorization for mixed types
| (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
CoeffReadCost = InnerSize == Dynamic ? Dynamic
: InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,
/* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
* of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
* loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
* the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
*/
CanVectorizeInner = SameType
&& LhsRowMajor
&& (!RhsRowMajor)
&& (LhsFlags & RhsFlags & ActualPacketAccessBit)
&& (LhsFlags & RhsFlags & AlignedBit)
&& (InnerSize % packet_traits<Scalar>::size == 0)
};
};
} // end namespace internal
template<typename LhsNested, typename RhsNested, int NestingFlags>
class CoeffBasedProduct
: internal::no_assignment_operator,
public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
{
public:
typedef MatrixBase<CoeffBasedProduct> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
typedef typename Base::PlainObject PlainObject;
private:
typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
enum {
PacketSize = internal::packet_traits<Scalar>::size,
InnerSize = internal::traits<CoeffBasedProduct>::InnerSize,
Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
};
typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
public:
inline CoeffBasedProduct(const CoeffBasedProduct& other)
: Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
{}
template<typename Lhs, typename Rhs>
inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
: m_lhs(lhs), m_rhs(rhs)
{
// we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
// We still allow mixing T and complex<T>.
EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
eigen_assert(lhs.cols() == rhs.rows()
&& "invalid matrix product"
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
{
Scalar res;
ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
return res;
}
/* Allow index-based non-packet access. It is impossible, though, to allow index-based packet access,
* which is why we don't set the LinearAccessBit.
*/
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
Scalar res;
const Index row = RowsAtCompileTime == 1 ? 0 : index;
const Index col = RowsAtCompileTime == 1 ? index : 0;
ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
return res;
}
template<int LoadMode>
EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
{
PacketScalar res;
internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested, PacketScalar, LoadMode>
::run(row, col, m_lhs, m_rhs, res);
return res;
}
// Implicit conversion to the nested type (triggers the evaluation of the product)
EIGEN_STRONG_INLINE operator const PlainObject& () const
{
m_result.lazyAssign(*this);
return m_result;
}
const _LhsNested& lhs() const { return m_lhs; }
const _RhsNested& rhs() const { return m_rhs; }
const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
template<int DiagonalIndex>
const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }
protected:
typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
mutable PlainObject m_result;
};
namespace internal {
// here we need to overload the nested rule for products
// such that the nested type is a const reference to a plain matrix
template<typename Lhs, typename Rhs, int N, typename PlainObject>
struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
{
typedef PlainObject const& type;
};
/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/
/**************************************
*** Scalar path - no vectorization ***
**************************************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
{
eigen_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
for(Index i = 1; i < lhs.cols(); ++i)
res += lhs.coeff(row, i) * rhs.coeff(i, col);
}
};
/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
struct product_coeff_vectorized_unroller
{
typedef typename Lhs::Index Index;
enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
{
product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
}
};
template<typename Lhs, typename Rhs, typename Packet>
struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
{
pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
}
};
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::PacketScalar Packet;
typedef typename Lhs::Index Index;
enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
Packet pres;
product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
res = predux(pres);
}
};
template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct product_coeff_vectorized_dyn_selector
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
}
};
// NOTE the 3 following specializations exist because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
}
};
template<typename Lhs, typename Rhs, int LhsRows>
struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
}
};
template<typename Lhs, typename Rhs>
struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.transpose().cwiseProduct(rhs).sum();
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
}
};
/*******************
*** Packet path ***
*******************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
}
};
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
eigen_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(Index i = 1; i < lhs.cols(); ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
eigen_assert(lhs.cols()>0 && "you are using an uninitialized matrix");
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
for(Index i = 1; i < lhs.cols(); ++i)
res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COEFFBASED_PRODUCT_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H
namespace Eigen {
namespace internal {
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;
/** \internal \returns b if a<=0, and returns a otherwise. */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
return a<=0 ? b : a;
}
/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
static std::ptrdiff_t m_l1CacheSize = 0;
static std::ptrdiff_t m_l2CacheSize = 0;
if(m_l2CacheSize==0)
{
m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
}
if(action==SetAction)
{
// set the cpu cache sizes, from which all block sizes are derived (values in bytes)
eigen_internal_assert(l1!=0 && l2!=0);
m_l1CacheSize = *l1;
m_l2CacheSize = *l2;
}
else if(action==GetAction)
{
eigen_internal_assert(l1!=0 && l2!=0);
*l1 = m_l1CacheSize;
*l2 = m_l2CacheSize;
}
else
{
eigen_internal_assert(false);
}
}
/** \brief Computes the blocking parameters for a m x k times k x n matrix product
*
* \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
* \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
* \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
*
* Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
* this function computes the blocking size parameters along the respective dimensions
* for matrix products and related algorithms. The blocking sizes depend on various
* parameters:
* - the L1 and L2 cache sizes,
* - the register level blocking sizes defined by gebp_traits,
* - the number of scalars that fit into a packet (when vectorization is enabled).
*
* \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
EIGEN_UNUSED_VARIABLE(n);
// Explanations:
// Let's recall that the product algorithm forms kc x nc horizontal panels B' on the rhs and
// mc x kc blocks A' on the lhs. A' has to fit into the L2 cache. Moreover, B' is processed
// in kc x nr small vertical panels, where nr is the blocking size along the n dimension
// at the register level. For vectorization purposes, these small vertical panels are unpacked,
// i.e., each coefficient is replicated to fill a packet. Such a small vertical panel has to
// stay in the L1 cache.
std::ptrdiff_t l1, l2;
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
enum {
kdiv = KcFactor * 2 * Traits::nr
* Traits::RhsProgress * sizeof(RhsScalar),
mr = gebp_traits<LhsScalar,RhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr
};
manage_caching_sizes(GetAction, &l1, &l2);
k = std::min<SizeType>(k, l1/kdiv);
SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
if(_m<m) m = _m & mr_mask;
}
template<typename LhsScalar, typename RhsScalar, typename SizeType>
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}
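// Illustrative usage, e.g., for a large float product (the values here are just an example):
//   std::ptrdiff_t k = 1024, m = 1024, n = 1024;
//   computeProductBlockingSizes<float,float>(k, m, n);
// On return, k and m hold cache-friendly blocking sizes; n is currently left untouched.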
#ifdef EIGEN_HAS_FUSE_CJMADD
#define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#else
// FIXME (a bit overkill maybe ?)
template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
{
c = cj.pmadd(a,b,c);
}
};
template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
{
t = b; t = cj.pmul(a,t); c = padd(c,t);
}
};
template<typename CJ, typename A, typename B, typename C, typename T>
EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
{
gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
}
#define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif
/* Vectorization logic
* real*real: unpack rhs to constant packets, ...
*
* cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
* storing each res packet into two packets (2x2),
* at the end combine them: swap the second and addsub them
* cf*cf : same but with 2x4 blocks
* cplx*real : unpack rhs to constant packets, ...
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
*/
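/* For reference, the cd*cd scheme exploits the identity
*   (a_r + i a_i) * (b_r + i b_i) = (a_r b_r - a_i b_i) + i (a_r b_i + a_i b_r):
* one accumulator holds a * b_r and the other a * b_i; acc() conjugates/flips the
* second one and adds it to the first to recover the complex product.
*/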
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
typedef _LhsScalar LhsScalar;
typedef _RhsScalar RhsScalar;
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
enum {
ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs,
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
// register block size along the N direction (must be either 2 or 4)
nr = NumberOfRegisters/4,
// register block size along the M direction (currently, this one cannot be modified)
mr = 2 * LhsPacketSize,
WorkSpaceFactor = nr * RhsPacketSize,
LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
p = pset1<ResPacket>(ResScalar(0));
}
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
{
for(DenseIndex k=0; k<n; k++)
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
{
dest = pload<RhsPacket>(b);
}
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
dest = pload<LhsPacket>(a);
}
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
{
tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
}
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
{
r = pmadd(c,alpha,r);
}
protected:
// conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
// conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
};
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
typedef std::complex<RealScalar> LhsScalar;
typedef RealScalar RhsScalar;
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
enum {
ConjLhs = _ConjLhs,
ConjRhs = false,
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = NumberOfRegisters/4,
mr = 2 * LhsPacketSize,
WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = LhsPacketSize,
RhsProgress = RhsPacketSize
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
p = pset1<ResPacket>(ResScalar(0));
}
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
{
for(DenseIndex k=0; k<n; k++)
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
{
dest = pload<RhsPacket>(b);
}
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
dest = pload<LhsPacket>(a);
}
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
{
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
}
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
}
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
{
c += a * b;
}
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
{
r = cj.pmadd(c,alpha,r);
}
protected:
conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
typedef std::complex<RealScalar> Scalar;
typedef std::complex<RealScalar> LhsScalar;
typedef std::complex<RealScalar> RhsScalar;
typedef std::complex<RealScalar> ResScalar;
enum {
ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs,
Vectorizable = packet_traits<RealScalar>::Vectorizable
&& packet_traits<Scalar>::Vectorizable,
RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
nr = 2,
mr = 2 * ResPacketSize,
WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
LhsProgress = ResPacketSize,
RhsProgress = Vectorizable ? 2*ResPacketSize : 1
};
typedef typename packet_traits<RealScalar>::type RealPacket;
typedef typename packet_traits<Scalar>::type ScalarPacket;
struct DoublePacket
{
RealPacket first;
RealPacket second;
};
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
{
p.first = pset1<RealPacket>(RealScalar(0));
p.second = pset1<RealPacket>(RealScalar(0));
}
/* Unpack the rhs coefficients such that each complex coefficient is spread into
* two packets containing respectively the real and the imaginary parts,
* duplicated as many times as needed: (x+iy) => [x, ..., x] [y, ..., y]
*/
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
{
for(DenseIndex k=0; k<n; k++)
{
if(Vectorizable)
{
pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k]));
pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
}
else
b[k] = rhs[k];
}
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
{
dest.first = pload<RealPacket>((const RealScalar*)b);
dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
}
// nothing special here
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
}
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
{
c.first = padd(pmul(a,b.first), c.first);
c.second = padd(pmul(a,b.second),c.second);
}
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
{
c = cj.pmadd(a,b,c);
}
EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
{
// assemble c
ResPacket tmp;
if((!ConjLhs)&&(!ConjRhs))
{
tmp = pcplxflip(pconj(ResPacket(c.second)));
tmp = padd(ResPacket(c.first),tmp);
}
else if((!ConjLhs)&&(ConjRhs))
{
tmp = pconj(pcplxflip(ResPacket(c.second)));
tmp = padd(ResPacket(c.first),tmp);
}
else if((ConjLhs)&&(!ConjRhs))
{
tmp = pcplxflip(ResPacket(c.second));
tmp = padd(pconj(ResPacket(c.first)),tmp);
}
else if((ConjLhs)&&(ConjRhs))
{
tmp = pcplxflip(ResPacket(c.second));
tmp = psub(pconj(ResPacket(c.first)),tmp);
}
r = pmadd(tmp,alpha,r);
}
protected:
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
typedef std::complex<RealScalar> Scalar;
typedef RealScalar LhsScalar;
typedef Scalar RhsScalar;
typedef Scalar ResScalar;
enum {
ConjLhs = false,
ConjRhs = _ConjRhs,
Vectorizable = packet_traits<RealScalar>::Vectorizable
&& packet_traits<Scalar>::Vectorizable,
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = 4,
mr = 2*ResPacketSize,
WorkSpaceFactor = nr*RhsPacketSize,
LhsProgress = ResPacketSize,
RhsProgress = ResPacketSize
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
p = pset1<ResPacket>(ResScalar(0));
}
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
{
for(DenseIndex k=0; k<n; k++)
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
}
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
{
dest = pload<RhsPacket>(b);
}
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
dest = ploaddup<LhsPacket>(a);
}
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
{
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
}
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
}
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
{
c += a * b;
}
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
{
r = cj.pmadd(alpha,c,r);
}
protected:
conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
/* optimized GEneral packed Block * packed Panel product kernel
*
* Mixing type logic: C += A * B
* | A | B | comments
* |real |cplx | no vectorization yet, would require packing A with duplication
* |cplx |real | easy vectorization
*/
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
typedef typename Traits::ResScalar ResScalar;
typedef typename Traits::LhsPacket LhsPacket;
typedef typename Traits::RhsPacket RhsPacket;
typedef typename Traits::ResPacket ResPacket;
typedef typename Traits::AccPacket AccPacket;
enum {
Vectorizable = Traits::Vectorizable,
LhsProgress = Traits::LhsProgress,
RhsProgress = Traits::RhsProgress,
ResPacketSize = Traits::ResPacketSize
};
EIGEN_DONT_INLINE EIGEN_FLATTEN_ATTRIB
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
{
Traits traits;
if(strideA==-1) strideA = depth;
if(strideB==-1) strideB = depth;
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
// conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
Index packet_cols = (cols/nr) * nr;
const Index peeled_mc = (rows/mr)*mr;
// FIXME:
const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
const Index peeled_kc = (depth/4)*4;
if(unpackedB==0)
unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
// loops on each micro vertical panel of rhs (depth x nr)
for(Index j2=0; j2<packet_cols; j2+=nr)
{
traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);
// loops on each largest micro horizontal panel of lhs (mr x depth)
// => we select an mr x nr micro block of res which is entirely
// stored into mr/packet_size x nr registers.
for(Index i=0; i<peeled_mc; i+=mr)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
traits.initAcc(C0);
traits.initAcc(C1);
if(nr==4) traits.initAcc(C2);
if(nr==4) traits.initAcc(C3);
traits.initAcc(C4);
traits.initAcc(C5);
if(nr==4) traits.initAcc(C6);
if(nr==4) traits.initAcc(C7);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = r0 + resStride;
ResScalar* r2 = r1 + resStride;
ResScalar* r3 = r2 + resStride;
prefetch(r0+16);
prefetch(r1+16);
prefetch(r2+16);
prefetch(r3+16);
// performs "inner" product
// TODO let's check whether the following peeled loop could not be
// optimized via optimal prefetching from one loop to the other
const RhsScalar* blB = unpackedB;
for(Index k=0; k<peeled_kc; k+=4)
{
if(nr==2)
{
LhsPacket A0, A1;
RhsPacket B_0;
RhsPacket T0;
EIGEN_ASM_COMMENT("mybegin2");
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadLhs(&blA[1*LhsProgress], A1);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[1*RhsProgress], B_0);
traits.madd(A0,B_0,C1,T0);
traits.madd(A1,B_0,C5,B_0);
traits.loadLhs(&blA[2*LhsProgress], A0);
traits.loadLhs(&blA[3*LhsProgress], A1);
traits.loadRhs(&blB[2*RhsProgress], B_0);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[3*RhsProgress], B_0);
traits.madd(A0,B_0,C1,T0);
traits.madd(A1,B_0,C5,B_0);
traits.loadLhs(&blA[4*LhsProgress], A0);
traits.loadLhs(&blA[5*LhsProgress], A1);
traits.loadRhs(&blB[4*RhsProgress], B_0);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[5*RhsProgress], B_0);
traits.madd(A0,B_0,C1,T0);
traits.madd(A1,B_0,C5,B_0);
traits.loadLhs(&blA[6*LhsProgress], A0);
traits.loadLhs(&blA[7*LhsProgress], A1);
traits.loadRhs(&blB[6*RhsProgress], B_0);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[7*RhsProgress], B_0);
traits.madd(A0,B_0,C1,T0);
traits.madd(A1,B_0,C5,B_0);
EIGEN_ASM_COMMENT("myend");
}
else
{
EIGEN_ASM_COMMENT("mybegin4");
LhsPacket A0, A1;
RhsPacket B_0, B1, B2, B3;
RhsPacket T0;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadLhs(&blA[1*LhsProgress], A1);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.loadRhs(&blB[1*RhsProgress], B1);
traits.madd(A0,B_0,C0,T0);
traits.loadRhs(&blB[2*RhsProgress], B2);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[3*RhsProgress], B3);
traits.loadRhs(&blB[4*RhsProgress], B_0);
traits.madd(A0,B1,C1,T0);
traits.madd(A1,B1,C5,B1);
traits.loadRhs(&blB[5*RhsProgress], B1);
traits.madd(A0,B2,C2,T0);
traits.madd(A1,B2,C6,B2);
traits.loadRhs(&blB[6*RhsProgress], B2);
traits.madd(A0,B3,C3,T0);
traits.loadLhs(&blA[2*LhsProgress], A0);
traits.madd(A1,B3,C7,B3);
traits.loadLhs(&blA[3*LhsProgress], A1);
traits.loadRhs(&blB[7*RhsProgress], B3);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[8*RhsProgress], B_0);
traits.madd(A0,B1,C1,T0);
traits.madd(A1,B1,C5,B1);
traits.loadRhs(&blB[9*RhsProgress], B1);
traits.madd(A0,B2,C2,T0);
traits.madd(A1,B2,C6,B2);
traits.loadRhs(&blB[10*RhsProgress], B2);
traits.madd(A0,B3,C3,T0);
traits.loadLhs(&blA[4*LhsProgress], A0);
traits.madd(A1,B3,C7,B3);
traits.loadLhs(&blA[5*LhsProgress], A1);
traits.loadRhs(&blB[11*RhsProgress], B3);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[12*RhsProgress], B_0);
traits.madd(A0,B1,C1,T0);
traits.madd(A1,B1,C5,B1);
traits.loadRhs(&blB[13*RhsProgress], B1);
traits.madd(A0,B2,C2,T0);
traits.madd(A1,B2,C6,B2);
traits.loadRhs(&blB[14*RhsProgress], B2);
traits.madd(A0,B3,C3,T0);
traits.loadLhs(&blA[6*LhsProgress], A0);
traits.madd(A1,B3,C7,B3);
traits.loadLhs(&blA[7*LhsProgress], A1);
traits.loadRhs(&blB[15*RhsProgress], B3);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.madd(A0,B1,C1,T0);
traits.madd(A1,B1,C5,B1);
traits.madd(A0,B2,C2,T0);
traits.madd(A1,B2,C6,B2);
traits.madd(A0,B3,C3,T0);
traits.madd(A1,B3,C7,B3);
}
blB += 4*nr*RhsProgress;
blA += 4*mr;
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
if(nr==2)
{
LhsPacket A0, A1;
RhsPacket B_0;
RhsPacket T0;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadLhs(&blA[1*LhsProgress], A1);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[1*RhsProgress], B_0);
traits.madd(A0,B_0,C1,T0);
traits.madd(A1,B_0,C5,B_0);
}
else
{
LhsPacket A0, A1;
RhsPacket B_0, B1, B2, B3;
RhsPacket T0;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadLhs(&blA[1*LhsProgress], A1);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.loadRhs(&blB[1*RhsProgress], B1);
traits.madd(A0,B_0,C0,T0);
traits.loadRhs(&blB[2*RhsProgress], B2);
traits.madd(A1,B_0,C4,B_0);
traits.loadRhs(&blB[3*RhsProgress], B3);
traits.madd(A0,B1,C1,T0);
traits.madd(A1,B1,C5,B1);
traits.madd(A0,B2,C2,T0);
traits.madd(A1,B2,C6,B2);
traits.madd(A0,B3,C3,T0);
traits.madd(A1,B3,C7,B3);
}
blB += nr*RhsProgress;
blA += mr;
}
if(nr==4)
{
ResPacket R0, R1, R2, R3, R4, R5, R6;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0);
R1 = ploadu<ResPacket>(r1);
R2 = ploadu<ResPacket>(r2);
R3 = ploadu<ResPacket>(r3);
R4 = ploadu<ResPacket>(r0 + ResPacketSize);
R5 = ploadu<ResPacket>(r1 + ResPacketSize);
R6 = ploadu<ResPacket>(r2 + ResPacketSize);
traits.acc(C0, alphav, R0);
pstoreu(r0, R0);
R0 = ploadu<ResPacket>(r3 + ResPacketSize);
traits.acc(C1, alphav, R1);
traits.acc(C2, alphav, R2);
traits.acc(C3, alphav, R3);
traits.acc(C4, alphav, R4);
traits.acc(C5, alphav, R5);
traits.acc(C6, alphav, R6);
traits.acc(C7, alphav, R0);
pstoreu(r1, R1);
pstoreu(r2, R2);
pstoreu(r3, R3);
pstoreu(r0 + ResPacketSize, R4);
pstoreu(r1 + ResPacketSize, R5);
pstoreu(r2 + ResPacketSize, R6);
pstoreu(r3 + ResPacketSize, R0);
}
else
{
ResPacket R0, R1, R4;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0);
R1 = ploadu<ResPacket>(r1);
R4 = ploadu<ResPacket>(r0 + ResPacketSize);
traits.acc(C0, alphav, R0);
pstoreu(r0, R0);
R0 = ploadu<ResPacket>(r1 + ResPacketSize);
traits.acc(C1, alphav, R1);
traits.acc(C4, alphav, R4);
traits.acc(C5, alphav, R0);
pstoreu(r1, R1);
pstoreu(r0 + ResPacketSize, R4);
pstoreu(r1 + ResPacketSize, R0);
}
}
if(rows-peeled_mc>=LhsProgress)
{
Index i = peeled_mc;
const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3;
traits.initAcc(C0);
traits.initAcc(C1);
if(nr==4) traits.initAcc(C2);
if(nr==4) traits.initAcc(C3);
// performs "inner" product
const RhsScalar* blB = unpackedB;
for(Index k=0; k<peeled_kc; k+=4)
{
if(nr==2)
{
LhsPacket A0;
RhsPacket B_0, B1;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.loadRhs(&blB[1*RhsProgress], B1);
traits.madd(A0,B_0,C0,B_0);
traits.loadRhs(&blB[2*RhsProgress], B_0);
traits.madd(A0,B1,C1,B1);
traits.loadLhs(&blA[1*LhsProgress], A0);
traits.loadRhs(&blB[3*RhsProgress], B1);
traits.madd(A0,B_0,C0,B_0);
traits.loadRhs(&blB[4*RhsProgress], B_0);
traits.madd(A0,B1,C1,B1);
traits.loadLhs(&blA[2*LhsProgress], A0);
traits.loadRhs(&blB[5*RhsProgress], B1);
traits.madd(A0,B_0,C0,B_0);
traits.loadRhs(&blB[6*RhsProgress], B_0);
traits.madd(A0,B1,C1,B1);
traits.loadLhs(&blA[3*LhsProgress], A0);
traits.loadRhs(&blB[7*RhsProgress], B1);
traits.madd(A0,B_0,C0,B_0);
traits.madd(A0,B1,C1,B1);
}
else
{
LhsPacket A0;
RhsPacket B_0, B1, B2, B3;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.loadRhs(&blB[1*RhsProgress], B1);
traits.madd(A0,B_0,C0,B_0);
traits.loadRhs(&blB[2*RhsProgress], B2);
traits.loadRhs(&blB[3*RhsProgress], B3);
traits.loadRhs(&blB[4*RhsProgress], B_0);
traits.madd(A0,B1,C1,B1);
traits.loadRhs(&blB[5*RhsProgress], B1);
traits.madd(A0,B2,C2,B2);
traits.loadRhs(&blB[6*RhsProgress], B2);
traits.madd(A0,B3,C3,B3);
traits.loadLhs(&blA[1*LhsProgress], A0);
traits.loadRhs(&blB[7*RhsProgress], B3);
traits.madd(A0,B_0,C0,B_0);
traits.loadRhs(&blB[8*RhsProgress], B_0);
traits.madd(A0,B1,C1,B1);
traits.loadRhs(&blB[9*RhsProgress], B1);
traits.madd(A0,B2,C2,B2);
traits.loadRhs(&blB[10*RhsProgress], B2);
traits.madd(A0,B3,C3,B3);
traits.loadLhs(&blA[2*LhsProgress], A0);
traits.loadRhs(&blB[11*RhsProgress], B3);
traits.madd(A0,B_0,C0,B_0);
traits.loadRhs(&blB[12*RhsProgress], B_0);
traits.madd(A0,B1,C1,B1);
traits.loadRhs(&blB[13*RhsProgress], B1);
traits.madd(A0,B2,C2,B2);
traits.loadRhs(&blB[14*RhsProgress], B2);
traits.madd(A0,B3,C3,B3);
traits.loadLhs(&blA[3*LhsProgress], A0);
traits.loadRhs(&blB[15*RhsProgress], B3);
traits.madd(A0,B_0,C0,B_0);
traits.madd(A0,B1,C1,B1);
traits.madd(A0,B2,C2,B2);
traits.madd(A0,B3,C3,B3);
}
blB += nr*4*RhsProgress;
blA += 4*LhsProgress;
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
if(nr==2)
{
LhsPacket A0;
RhsPacket B_0, B1;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.loadRhs(&blB[1*RhsProgress], B1);
traits.madd(A0,B_0,C0,B_0);
traits.madd(A0,B1,C1,B1);
}
else
{
LhsPacket A0;
RhsPacket B_0, B1, B2, B3;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.loadRhs(&blB[1*RhsProgress], B1);
traits.loadRhs(&blB[2*RhsProgress], B2);
traits.loadRhs(&blB[3*RhsProgress], B3);
traits.madd(A0,B_0,C0,B_0);
traits.madd(A0,B1,C1,B1);
traits.madd(A0,B2,C2,B2);
traits.madd(A0,B3,C3,B3);
}
blB += nr*RhsProgress;
blA += LhsProgress;
}
ResPacket R0, R1, R2, R3;
ResPacket alphav = pset1<ResPacket>(alpha);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = r0 + resStride;
ResScalar* r2 = r1 + resStride;
ResScalar* r3 = r2 + resStride;
R0 = ploadu<ResPacket>(r0);
R1 = ploadu<ResPacket>(r1);
if(nr==4) R2 = ploadu<ResPacket>(r2);
if(nr==4) R3 = ploadu<ResPacket>(r3);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
if(nr==4) traits.acc(C2, alphav, R2);
if(nr==4) traits.acc(C3, alphav, R3);
pstoreu(r0, R0);
pstoreu(r1, R1);
if(nr==4) pstoreu(r2, R2);
if(nr==4) pstoreu(r3, R3);
}
for(Index i=peeled_mc2; i<rows; i++)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
// gets a 1 x nr res block as registers
ResScalar C0(0), C1(0), C2(0), C3(0);
// TODO directly use blockB ???
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
for(Index k=0; k<depth; k++)
{
if(nr==2)
{
LhsScalar A0;
RhsScalar B_0, B1;
A0 = blA[k];
B_0 = blB[0];
B1 = blB[1];
MADD(cj,A0,B_0,C0,B_0);
MADD(cj,A0,B1,C1,B1);
}
else
{
LhsScalar A0;
RhsScalar B_0, B1, B2, B3;
A0 = blA[k];
B_0 = blB[0];
B1 = blB[1];
B2 = blB[2];
B3 = blB[3];
MADD(cj,A0,B_0,C0,B_0);
MADD(cj,A0,B1,C1,B1);
MADD(cj,A0,B2,C2,B2);
MADD(cj,A0,B3,C3,B3);
}
blB += nr;
}
res[(j2+0)*resStride + i] += alpha*C0;
res[(j2+1)*resStride + i] += alpha*C1;
if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
}
}
// process remaining rhs/res columns one at a time
// => do the same but with nr==1
for(Index j2=packet_cols; j2<cols; j2++)
{
// unpack B
traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
for(Index i=0; i<peeled_mc; i+=mr)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
prefetch(&blA[0]);
// TODO move the res loads to the stores
// get res block as registers
AccPacket C0, C4;
traits.initAcc(C0);
traits.initAcc(C4);
const RhsScalar* blB = unpackedB;
for(Index k=0; k<depth; k++)
{
LhsPacket A0, A1;
RhsPacket B_0;
RhsPacket T0;
traits.loadLhs(&blA[0*LhsProgress], A0);
traits.loadLhs(&blA[1*LhsProgress], A1);
traits.loadRhs(&blB[0*RhsProgress], B_0);
traits.madd(A0,B_0,C0,T0);
traits.madd(A1,B_0,C4,B_0);
blB += RhsProgress;
blA += 2*LhsProgress;
}
ResPacket R0, R4;
ResPacket alphav = pset1<ResPacket>(alpha);
ResScalar* r0 = &res[(j2+0)*resStride + i];
R0 = ploadu<ResPacket>(r0);
R4 = ploadu<ResPacket>(r0+ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R4);
pstoreu(r0, R0);
pstoreu(r0+ResPacketSize, R4);
}
if(rows-peeled_mc>=LhsProgress)
{
Index i = peeled_mc;
const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
prefetch(&blA[0]);
AccPacket C0;
traits.initAcc(C0);
const RhsScalar* blB = unpackedB;
for(Index k=0; k<depth; k++)
{
LhsPacket A0;
RhsPacket B_0;
traits.loadLhs(blA, A0);
traits.loadRhs(blB, B_0);
traits.madd(A0, B_0, C0, B_0);
blB += RhsProgress;
blA += LhsProgress;
}
ResPacket alphav = pset1<ResPacket>(alpha);
ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
traits.acc(C0, alphav, R0);
pstoreu(&res[(j2+0)*resStride + i], R0);
}
for(Index i=peeled_mc2; i<rows; i++)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
// gets a 1 x 1 res block as registers
ResScalar C0(0);
// FIXME directly use blockB ??
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++)
{
LhsScalar A0 = blA[k];
RhsScalar B_0 = blB[k];
MADD(cj, A0, B_0, C0, B_0);
}
res[(j2+0)*resStride + i] += alpha*C0;
}
}
}
};
#undef CJMADD
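// The peeled loops above implement a register-blocked micro-kernel: an mr x nr
// block of the result is kept in accumulation registers while the packed lhs and
// rhs panels are streamed through. The following scalar reference is a
// hypothetical illustration only (the helper name, layout and signature are
// assumptions, not part of Eigen); it computes the same mr x nr update without
// vectorization, peeling or prefetching.
template<typename Scalar, typename Index, int MR, int NR>
void gebp_microkernel_reference(Scalar* C, Index ldc,
                                const Scalar* packedA, // MR x depth, stored column by column
                                const Scalar* packedB, // depth x NR, stored row by row
                                Index depth, Scalar alpha)
{
  Scalar acc[MR*NR] = {}; // stands in for the C0..C7 accumulation registers
  for(Index k=0; k<depth; ++k)
    for(Index j=0; j<NR; ++j)
      for(Index i=0; i<MR; ++i)
        acc[i+j*MR] += packedA[k*MR+i] * packedB[k*NR+j];
  // scale by alpha and accumulate into the MR x NR block of the result
  for(Index j=0; j<NR; ++j)
    for(Index i=0; i<MR; ++i)
      C[i+j*ldc] += alpha * acc[i+j*MR];
}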
// pack a block of the lhs
// The traversal is as follows (mr==4):
// 0 4 8 12 ...
// 1 5 9 13 ...
// 2 6 10 14 ...
// 3 7 11 15 ...
//
// 16 20 24 28 ...
// 17 21 25 29 ...
// 18 22 26 30 ...
// 19 23 27 31 ...
//
// 32 33 34 35 ...
// 36 37 38 39 ...
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
Index stride=0, Index offset=0)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
Index peeled_mc = (rows/Pack1)*Pack1;
for(Index i=0; i<peeled_mc; i+=Pack1)
{
if(PanelMode) count += Pack1 * offset;
if(StorageOrder==ColMajor)
{
for(Index k=0; k<depth; k++)
{
Packet A, B, C, D;
if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
}
}
else
{
for(Index k=0; k<depth; k++)
{
// TODO add a vectorized transpose here
Index w=0;
for(; w<Pack1-3; w+=4)
{
Scalar a(cj(lhs(i+w+0, k))),
b(cj(lhs(i+w+1, k))),
c(cj(lhs(i+w+2, k))),
d(cj(lhs(i+w+3, k)));
blockA[count++] = a;
blockA[count++] = b;
blockA[count++] = c;
blockA[count++] = d;
}
if(Pack1%4)
for(;w<Pack1;++w)
blockA[count++] = cj(lhs(i+w, k));
}
}
if(PanelMode) count += Pack1 * (stride-offset-depth);
}
if(rows-peeled_mc>=Pack2)
{
if(PanelMode) count += Pack2*offset;
for(Index k=0; k<depth; k++)
for(Index w=0; w<Pack2; w++)
blockA[count++] = cj(lhs(peeled_mc+w, k));
if(PanelMode) count += Pack2 * (stride-offset-depth);
peeled_mc += Pack2;
}
for(Index i=peeled_mc; i<rows; i++)
{
if(PanelMode) count += offset;
for(Index k=0; k<depth; k++)
blockA[count++] = cj(lhs(i, k));
if(PanelMode) count += (stride-offset-depth);
}
}
};
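// Hypothetical scalar sketch (not part of Eigen) of the column-major packing
// above, assuming a plain column-major lhs, no vectorization, no conjugation,
// no PanelMode and no Pack2 leftover step. It reproduces the
// 0 4 8 12 / 1 5 9 13 / ... traversal documented before gemm_pack_lhs: Pack1
// consecutive rows are interleaved, depth column after depth column.
template<typename Scalar, typename Index>
void pack_lhs_reference(Scalar* blockA, const Scalar* lhs, Index lhsStride,
                        Index depth, Index rows, Index Pack1)
{
  Index count = 0;
  const Index peeled_rows = (rows/Pack1)*Pack1;
  for(Index i=0; i<peeled_rows; i+=Pack1)
    for(Index k=0; k<depth; ++k)
      for(Index w=0; w<Pack1; ++w)
        blockA[count++] = lhs[(i+w) + k*lhsStride]; // lhs(i+w, k) in column-major storage
  for(Index i=peeled_rows; i<rows; ++i)             // remaining rows, one at a time
    for(Index k=0; k<depth; ++k)
      blockA[count++] = lhs[i + k*lhsStride];
}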
// copy a complete panel of the rhs
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
// 0 1 2 3 12 13 14 15 24 27
// 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . .
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
Index stride=0, Index offset=0)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
Index packet_cols = (cols/nr) * nr;
Index count = 0;
for(Index j2=0; j2<packet_cols; j2+=nr)
{
// skip what we have before
if(PanelMode) count += nr * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
for(Index k=0; k<depth; k++)
{
blockB[count+0] = cj(b0[k]);
blockB[count+1] = cj(b1[k]);
if(nr==4) blockB[count+2] = cj(b2[k]);
if(nr==4) blockB[count+3] = cj(b3[k]);
count += nr;
}
// skip what we have after
if(PanelMode) count += nr * (stride-offset-depth);
}
// copy the remaining columns one at a time (nr==1)
for(Index j2=packet_cols; j2<cols; ++j2)
{
if(PanelMode) count += offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
for(Index k=0; k<depth; k++)
{
blockB[count] = cj(b0[k]);
count += 1;
}
if(PanelMode) count += (stride-offset-depth);
}
}
};
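// Hypothetical scalar sketch (not part of Eigen) of the column-major rhs packing
// above, assuming no conjugation and no PanelMode. nr columns are interleaved
// coefficient by coefficient along the depth, which yields the
// 0 1 2 3 / 4 5 6 7 / ... traversal documented before the struct; leftover
// columns are appended one at a time.
template<typename Scalar, typename Index>
void pack_rhs_colmajor_reference(Scalar* blockB, const Scalar* rhs, Index rhsStride,
                                 Index depth, Index cols, Index nr)
{
  Index count = 0;
  const Index packet_cols = (cols/nr)*nr;
  for(Index j2=0; j2<packet_cols; j2+=nr)
    for(Index k=0; k<depth; ++k)
      for(Index j=0; j<nr; ++j)
        blockB[count++] = rhs[k + (j2+j)*rhsStride]; // rhs(k, j2+j) in column-major storage
  for(Index j2=packet_cols; j2<cols; ++j2)           // remaining columns (nr==1)
    for(Index k=0; k<depth; ++k)
      blockB[count++] = rhs[k + j2*rhsStride];
}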
// this version is optimized for row major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
Index stride=0, Index offset=0)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
Index packet_cols = (cols/nr) * nr;
Index count = 0;
for(Index j2=0; j2<packet_cols; j2+=nr)
{
// skip what we have before
if(PanelMode) count += nr * offset;
for(Index k=0; k<depth; k++)
{
const Scalar* b0 = &rhs[k*rhsStride + j2];
blockB[count+0] = cj(b0[0]);
blockB[count+1] = cj(b0[1]);
if(nr==4) blockB[count+2] = cj(b0[2]);
if(nr==4) blockB[count+3] = cj(b0[3]);
count += nr;
}
// skip what we have after
if(PanelMode) count += nr * (stride-offset-depth);
}
// copy the remaining columns one at a time (nr==1)
for(Index j2=packet_cols; j2<cols; ++j2)
{
if(PanelMode) count += offset;
const Scalar* b0 = &rhs[j2];
for(Index k=0; k<depth; k++)
{
blockB[count] = cj(b0[k*rhsStride]);
count += 1;
}
if(PanelMode) count += stride-offset-depth;
}
}
};
} // end namespace internal
/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
* \sa setCpuCacheSize */
inline std::ptrdiff_t l1CacheSize()
{
std::ptrdiff_t l1, l2;
internal::manage_caching_sizes(GetAction, &l1, &l2);
return l1;
}
/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
* \sa setCpuCacheSize */
inline std::ptrdiff_t l2CacheSize()
{
std::ptrdiff_t l1, l2;
internal::manage_caching_sizes(GetAction, &l1, &l2);
return l2;
}
/** Set the cpu L1 and L2 cache sizes (in bytes).
* These values are used to adjust the size of the blocks
* for the algorithms that work on blocks.
*
* \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
internal::manage_caching_sizes(SetAction, &l1, &l2);
}
} // end namespace Eigen
#endif // EIGEN_GENERAL_BLOCK_PANEL_H
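// Usage sketch: how the cache-size API above can be queried and overridden from
// user code. This is an illustration only (the function name and the cache
// values are assumptions, not part of Eigen); it simply calls the l1CacheSize(),
// l2CacheSize() and setCpuCacheSizes() functions defined above.
#include <cstddef>
static inline void tune_cache_sizes_example()
{
  std::ptrdiff_t l1 = Eigen::l1CacheSize(); // L1 size (bytes) currently used by the blocking heuristics
  std::ptrdiff_t l2 = Eigen::l2CacheSize(); // L2 size (bytes) currently used by the blocking heuristics
  // pretend the target CPU has a 32KB L1 and a 1MB L2 (illustrative values only)
  Eigen::setCpuCacheSizes(32*1024, 1024*1024);
  (void)l1; (void)l2;
}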
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H
namespace Eigen {
namespace internal {
template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
/* Specialization for a row-major destination matrix => simple transposition of the product */
template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride,
ResScalar* res, Index resStride,
ResScalar alpha,
level3_blocking<RhsScalar,LhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
// transpose the product such that the result is column major
general_matrix_matrix_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
}
};
/* Specialization for a col-major destination matrix
* => Blocking algorithm following Goto's paper */
template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* res, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
//Index nc = blocking.nc(); // cache block size along the N direction
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
{
// this is the parallel version!
Index tid = omp_get_thread_num();
Index threads = omp_get_num_threads();
std::size_t sizeA = kc*mc;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
RhsScalar* blockB = blocking.blockB();
eigen_internal_assert(blockB!=0);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
for(Index k=0; k<depth; k+=kc)
{
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
// In order to reduce the chance that a thread has to wait for the others,
// let's start by packing A'.
pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
// Pack B_k to B' in a parallel fashion:
// each thread packs the sub block B_k,j to B'_j where j is the thread id.
// However, before copying to B'_j, we have to make sure that no other thread is still using it,
// i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
info[tid].users += threads;
pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
// Notify the other threads that the part B'_j is ready to go.
info[tid].sync = k;
// Computes C_i += A' * B' per B'_j
for(Index shift=0; shift<threads; ++shift)
{
Index j = (tid+shift)%threads;
// At this point we have to make sure that B'_j has been updated by thread j;
// we use testAndSetOrdered to mimic a volatile access.
// However, there is no need to wait for the B' part that has been updated by the current thread!
if(shift>0)
while(info[j].sync!=k) {}
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
}
// Then keep going as usual with the remaining A'
for(Index i=mc; i<rows; i+=mc)
{
const Index actual_mc = (std::min)(i+mc,rows)-i;
// pack A_i,k to A'
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
// C_i += A' * B'
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
}
// Release all the sub blocks B'_j of B' for the current thread,
// i.e., we simply decrement the number of users by 1
for(Index j=0; j<threads; ++j)
#pragma omp atomic
--(info[j].users);
}
}
else
#endif // EIGEN_HAS_OPENMP
{
EIGEN_UNUSED_VARIABLE(info);
// this is the sequential version!
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1)
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack rhs's panel into a sequential chunk of memory (L2 caching)
// Note that this panel will be read as many times as the number of blocks in the lhs's
// vertical panel which is, in practice, a very low number.
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
// For each mc x kc block of the lhs's vertical panel...
// (==GEPP_VAR1)
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
// We pack the lhs's block into a sequential chunk of memory (L1 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro vertical panels of the large rhs's panel (e.g., cols/4 times).
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
// Everything is packed, we can now call the block * panel kernel:
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
}
}
}
}
};
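// Hypothetical reference sketch (not part of Eigen) of the sequential blocking
// scheme implemented above: for each kc-deep horizontal panel of the rhs, pack
// it once, then for each mc-tall block of the lhs, pack it and call a
// block x panel kernel. Both operands are assumed column-major, and the
// pack/kernel functors are stand-ins for gemm_pack_rhs, gemm_pack_lhs and
// gebp_kernel; names and signatures are assumptions for this sketch only.
template<typename Scalar, typename Index, typename PackLhs, typename PackRhs, typename Kernel>
void gemm_blocked_reference(Index rows, Index cols, Index depth,
                            const Scalar* lhs, Index lhsStride,
                            const Scalar* rhs, Index rhsStride,
                            Scalar* res, Index resStride, Scalar alpha,
                            Index mc, Index kc,
                            Scalar* blockA, Scalar* blockB,
                            PackLhs pack_lhs, PackRhs pack_rhs, Kernel gebp)
{
  for(Index k2=0; k2<depth; k2+=kc)
  {
    const Index actual_kc = (std::min)(k2+kc, depth) - k2;
    // pack the actual_kc x cols panel of the rhs once; it is reused by every lhs block
    pack_rhs(blockB, rhs + k2, rhsStride, actual_kc, cols);
    for(Index i2=0; i2<rows; i2+=mc)
    {
      const Index actual_mc = (std::min)(i2+mc, rows) - i2;
      // pack the actual_mc x actual_kc block of the lhs; the kernel re-reads it many times
      pack_lhs(blockA, lhs + i2 + k2*lhsStride, lhsStride, actual_kc, actual_mc);
      // res(i2:i2+actual_mc, :) += alpha * A_block * B_panel
      gebp(res + i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
    }
  }
}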
/*********************************************************************************
* Specialization of GeneralProduct<> for "large" GEMM, i.e.,
* implementation of the high level wrapper to general_matrix_matrix_product
**********************************************************************************/
template<typename Lhs, typename Rhs>
struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
: traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
{};
template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
struct gemm_functor
{
gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha,
BlockingType& blocking)
: m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
{}
void initParallelSession() const
{
m_blocking.allocateB();
}
void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
{
if(cols==-1)
cols = m_rhs.cols();
Gemm::run(rows, cols, m_lhs.cols(),
/*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
/*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
m_actualAlpha, m_blocking, info);
}
protected:
const Lhs& m_lhs;
const Rhs& m_rhs;
Dest& m_dest;
Scalar m_actualAlpha;
BlockingType& m_blocking;
};
template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1,
bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space;
template<typename _LhsScalar, typename _RhsScalar>
class level3_blocking
{
typedef _LhsScalar LhsScalar;
typedef _RhsScalar RhsScalar;
protected:
LhsScalar* m_blockA;
RhsScalar* m_blockB;
RhsScalar* m_blockW;
DenseIndex m_mc;
DenseIndex m_nc;
DenseIndex m_kc;
public:
level3_blocking()
: m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
{}
inline DenseIndex mc() const { return m_mc; }
inline DenseIndex nc() const { return m_nc; }
inline DenseIndex kc() const { return m_kc; }
inline LhsScalar* blockA() { return m_blockA; }
inline RhsScalar* blockB() { return m_blockB; }
inline RhsScalar* blockW() { return m_blockW; }
};
template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
: public level3_blocking<
typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
{
enum {
Transpose = StorageOrder==RowMajor,
ActualRows = Transpose ? MaxCols : MaxRows,
ActualCols = Transpose ? MaxRows : MaxCols
};
typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
enum {
SizeA = ActualRows * MaxDepth,
SizeB = ActualCols * MaxDepth,
SizeW = MaxDepth * Traits::WorkSpaceFactor
};
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
public:
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
{
this->m_mc = ActualRows;
this->m_nc = ActualCols;
this->m_kc = MaxDepth;
this->m_blockA = m_staticA;
this->m_blockB = m_staticB;
this->m_blockW = m_staticW;
}
inline void allocateA() {}
inline void allocateB() {}
inline void allocateW() {}
inline void allocateAll() {}
};
template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false>
: public level3_blocking<
typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
{
enum {
Transpose = StorageOrder==RowMajor
};
typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
DenseIndex m_sizeA;
DenseIndex m_sizeB;
DenseIndex m_sizeW;
public:
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
{
this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols;
this->m_kc = depth;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
}
void allocateA()
{
if(this->m_blockA==0)
this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
}
void allocateB()
{
if(this->m_blockB==0)
this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
}
void allocateW()
{
if(this->m_blockW==0)
this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
}
void allocateAll()
{
allocateA();
allocateB();
allocateW();
}
~gemm_blocking_space()
{
aligned_delete(this->m_blockA, m_sizeA);
aligned_delete(this->m_blockB, m_sizeB);
aligned_delete(this->m_blockW, m_sizeW);
}
};
} // end namespace internal
template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
: public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
{
enum {
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
};
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef Scalar ResScalar;
GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
{
typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
}
template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const
{
eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
typedef internal::gemm_functor<
Scalar, Index,
internal::general_matrix_matrix_product<
Index,
LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
_ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
}
};
} // end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
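// Usage sketch: for large dynamic-size operands, a plain product expression is
// evaluated through the GeneralProduct<...,GemmProduct> path defined above.
// Illustration only (the function name and the sizes are assumptions, not part
// of Eigen); noalias() skips the temporary when the destination does not alias
// the operands, and the scalar factor alpha is folded into the GEMM call.
#include <Eigen/Dense>
static inline void gemm_usage_example()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(256, 128);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(128, 192);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(256, 192);
  const double alpha = 2.0;
  C.noalias() += alpha * A * B; // dispatched to the blocked GEMM implementation
}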
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
namespace Eigen {
namespace internal {
/**********************************************************************
* This file implements a general A * B product while
* evaluating only one triangular part of the product.
* This is a more general version of the self-adjoint product (C += A A^T),
* as provided by the level 3 SYRK BLAS routine.
**********************************************************************/
// forward declarations (defined at the end of this file)
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
struct tribb_kernel;
/* Optimized matrix-matrix product evaluating only one triangular half */
template <typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder, int UpLo, int Version = Specialized>
struct general_matrix_matrix_triangular_product;
// as usual if the result is row major => we transpose the product
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, ResScalar alpha)
{
general_matrix_matrix_triangular_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor, UpLo==Lower?Upper:Lower>
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
}
};
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, ResScalar alpha)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
// !!! mc must be a multiple of nr:
if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
RhsScalar* blockB = allocatedBlockB + sizeW;
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// note that the actual rhs is the transpose/adjoint of mat
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
for(Index i2=0; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
// the selected actual_mc * size panel of res is split into three different parts:
// 1 - before the diagonal => processed with gebp or skipped
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower)
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-1, -1, 0, 0, allocatedBlockB);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
if (UpLo==Upper)
{
Index j2 = i2+actual_mc;
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-1, -1, 0, 0, allocatedBlockB);
}
}
}
}
};
// Optimized packed Block * packed Block product kernel evaluating only one given triangular part
// This kernel is built on top of the gebp kernel:
// - the current destination block is processed per panel of actual_mc x BlockSize
// where BlockSize is set to the minimal value allowing gebp to be as fast as possible
// - then, as usual, each panel is split into three parts along the diagonal,
// the sub blocks above and below the diagonal are processed as usual,
// while the triangular block overlapping the diagonal is evaluated into a
// small temporary buffer which is then accumulated into the result using a
// triangular traversal.
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
struct tribb_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
typedef typename Traits::ResScalar ResScalar;
enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
};
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, ResScalar alpha, RhsScalar* workspace)
{
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
// let's process the block per panel of actual_mc x BlockSize,
// again, each is split into three parts, etc.
for (Index j=0; j<size; j+=BlockSize)
{
Index actualBlockSize = std::min<Index>(BlockSize,size - j);
const RhsScalar* actual_b = blockB+j*depth;
if(UpLo==Upper)
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace);
// selfadjoint micro block
{
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
ResScalar* r = res + (j+j1)*resStride + i;
for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1);
}
}
if(UpLo==Lower)
{
Index i = j+actualBlockSize;
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace);
}
}
}
};
} // end namespace internal
// high level API
template<typename MatrixType, unsigned int UpLo>
template<typename ProductDerived, typename _Lhs, typename _Rhs>
TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
{
typedef typename internal::remove_all<typename ProductDerived::LhsNested>::type Lhs;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
typedef typename internal::remove_all<typename ProductDerived::RhsNested>::type Rhs;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
typename ProductDerived::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
internal::general_matrix_matrix_triangular_product<Index,
typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
::run(m_matrix.cols(), actualLhs.cols(),
&actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
const_cast<Scalar*>(m_matrix.data()), m_matrix.outerStride(), actualAlpha);
return *this;
}
} // end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
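// Usage sketch: the SYRK-like kernel above backs, for instance, rankUpdate() on
// a selfadjoint view, which evaluates C += alpha * A * A^T (A^H in the complex
// case) while writing only one triangular half. Illustration only (the function
// name and the sizes are assumptions, not part of Eigen).
#include <Eigen/Dense>
static inline void rank_update_usage_example()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 30);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(100, 100);
  const double alpha = 0.5;
  // only the lower triangle of C is written; the upper part is left untouched
  C.selfadjointView<Eigen::Lower>().rankUpdate(A, alpha);
}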
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* Level 3 BLAS SYRK/HERK implementation.
********************************************************************************
*/
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
namespace Eigen {
namespace internal {
template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
struct general_matrix_matrix_rankupdate :
general_matrix_matrix_triangular_product<
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
// try to go to BLAS specialization
#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
{ \
if (lhs==rhs) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
} else { \
general_matrix_matrix_triangular_product<Index, \
Scalar, LhsStorageOrder, ConjugateLhs, \
Scalar, RhsStorageOrder, ConjugateRhs, \
ColMajor, UpLo, BuiltIn> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
} \
} \
};
EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
// SYRK for float/double
#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
enum { \
IsLower = (UpLo&Lower) == Lower, \
LowUp = IsLower ? Lower : Upper, \
conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
}; \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
{ \
/* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
\
MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
MKLTYPE alpha_, beta_; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
} \
};
// HERK for complex data
#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
enum { \
IsLower = (UpLo&Lower) == Lower, \
LowUp = IsLower ? Lower : Upper, \
conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
}; \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
{ \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
\
MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
RTYPE alpha_, beta_; \
const EIGTYPE* a_ptr; \
\
/* Set alpha_ & beta_ */ \
/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
alpha_ = alpha.real(); \
beta_ = 1.0; \
/* Copy with conjugation in some cases*/ \
MatrixType a; \
if (conjA) { \
Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
a = mapA.conjugate(); \
lda = a.outerStride(); \
a_ptr = a.data(); \
} else a_ptr=lhs; \
MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
} \
};
EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk)
//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk)
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* General matrix-matrix product functionality based on ?GEMM.
********************************************************************************
*/
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
namespace Eigen {
namespace internal {
/**********************************************************************
* This file implements general matrix-matrix multiplication using BLAS
* gemm function via partial specialization of
* general_matrix_matrix_product::run(..) method for float, double,
* std::complex<float> and std::complex<double> types
**********************************************************************/
// gemm specialization
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
{ \
static void run(Index rows, Index cols, Index depth, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha, \
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
GemmParallelInfo<Index>* /*info = 0*/) \
{ \
using std::conj; \
\
char transa, transb; \
MKL_INT m, n, k, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX a_tmp, b_tmp; \
EIGTYPE myone(1);\
\
/* Set transpose options */ \
transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
\
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
k = (MKL_INT)depth; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)lhsStride; \
ldb = (MKL_INT)rhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
a_tmp = lhs.conjugate(); \
a = a_tmp.data(); \
lda = a_tmp.outerStride(); \
} else a = _lhs; \
\
if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
b_tmp = rhs.conjugate(); \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} else b = _rhs; \
\
MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
}};
GEMM_SPECIALIZATION(double, d, double, d)
GEMM_SPECIALIZATION(float, f, float, s)
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c)
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H
namespace Eigen {
namespace internal {
/* Optimized col-major matrix * vector product:
* This algorithm processes 4 columns at once, which allows us both to reduce
* the number of loads/stores of the result by a factor of 4 and to reduce
* the instruction dependencies. Moreover, we know that all bands have the
* same alignment pattern.
*
* Mixing type logic: C += alpha * A * B
* | A | B |alpha| comments
* |real |cplx |cplx | no vectorization
* |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
* |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
*/
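// Hypothetical scalar reference (not part of Eigen) of the column-major
// matrix * vector scheme described above and implemented by the struct below:
// four columns of the lhs are combined per pass so that each coefficient of the
// result is loaded and stored only once per group of four columns. No
// vectorization, alignment handling or conjugation is shown; the helper name
// and signature are assumptions for this sketch only.
template<typename Scalar, typename Index>
void gemv_colmajor_reference(Index rows, Index cols,
                             const Scalar* lhs, Index lhsStride,
                             const Scalar* rhs, Index rhsIncr,
                             Scalar* res, Scalar alpha)
{
  const Index colsAtOnce = 4;
  const Index columnBound = (cols/colsAtOnce)*colsAtOnce;
  for(Index i=0; i<columnBound; i+=colsAtOnce)
  {
    // scale the four rhs coefficients once
    const Scalar c0 = alpha*rhs[(i+0)*rhsIncr], c1 = alpha*rhs[(i+1)*rhsIncr],
                 c2 = alpha*rhs[(i+2)*rhsIncr], c3 = alpha*rhs[(i+3)*rhsIncr];
    const Scalar *lhs0 = lhs+(i+0)*lhsStride, *lhs1 = lhs+(i+1)*lhsStride,
                 *lhs2 = lhs+(i+2)*lhsStride, *lhs3 = lhs+(i+3)*lhsStride;
    for(Index j=0; j<rows; ++j) // one pass over res for the four columns
      res[j] += c0*lhs0[j] + c1*lhs1[j] + c2*lhs2[j] + c3*lhs3[j];
  }
  for(Index i=columnBound; i<cols; ++i) // remaining columns, one at a time
  {
    const Scalar c0 = alpha*rhs[i*rhsIncr];
    const Scalar* lhs0 = lhs + i*lhsStride;
    for(Index j=0; j<rows; ++j)
      res[j] += c0*lhs0[j];
  }
}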
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
enum {
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
&& int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
ResScalar* res, Index
#ifdef EIGEN_INTERNAL_DEBUGGING
resIncr
#endif
, RhsScalar alpha)
{
eigen_internal_assert(resIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
pstore(&res[j], \
padd(pload<ResPacket>(&res[j]), \
padd( \
padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
if(ConjugateRhs)
alpha = conj(alpha);
enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
const Index columnsAtOnce = 4;
const Index peels = 2;
const Index LhsPacketAlignedMask = LhsPacketSize-1;
const Index ResPacketAlignedMask = ResPacketSize-1;
const Index size = rows;
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type.
Index alignedStart = internal::first_aligned(res,size);
Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
Index alignmentPattern = alignmentStep==0 ? AllAligned
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
// find how many columns we have to skip to be aligned with the result (if possible)
Index skipColumns = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
{
alignedSize = 0;
alignedStart = 0;
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
while (skipColumns<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
++skipColumns;
if (skipColumns==LhsPacketSize)
{
// nothing can be aligned, no need to skip any column
alignmentPattern = NoneAligned;
skipColumns = 0;
}
else
{
skipColumns = (std::min)(skipColumns,cols);
// note that the skipped columns are processed later.
}
eigen_internal_assert( (alignmentPattern==NoneAligned)
|| (skipColumns + columnsAtOnce >= cols)
|| LhsPacketSize > size
|| (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
}
else if(Vectorizable)
{
alignedStart = 0;
alignedSize = size;
alignmentPattern = AllAligned;
}
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
{
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
// this helps the compiler generate better binary code
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
if (Vectorizable)
{
/* explicit vectorization */
// process initial unaligned coeffs
for (Index j=0; j<alignedStart; ++j)
{
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
}
if (alignedSize>alignedStart)
{
switch(alignmentPattern)
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
break;
case FirstAligned:
{
Index j = alignedStart;
if(peels>1)
{
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
ResPacket T0, T1;
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
for (; j<peeledSize; j+=peels*ResPacketSize)
{
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
A00 = pload<LhsPacket>(&lhs0[j]);
A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
T0 = pcj.pmadd(A01, ptmp1, T0);
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
T0 = pcj.pmadd(A02, ptmp2, T0);
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
T0 = pcj.pmadd(A03, ptmp3, T0);
pstore(&res[j],T0);
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
T1 = pcj.pmadd(A11, ptmp1, T1);
T1 = pcj.pmadd(A12, ptmp2, T1);
T1 = pcj.pmadd(A13, ptmp3, T1);
pstore(&res[j+ResPacketSize],T1);
}
}
for (; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
break;
}
}
} // end explicit vectorization
/* process remaining coeffs (or all if there is no explicit vectorization) */
for (Index j=alignedSize; j<size; ++j)
{
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
}
}
// process remaining first and last columns (at most columnsAtOnce-1)
Index end = cols;
Index start = columnBound;
do
{
for (Index k=start; k<end; ++k)
{
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
const LhsScalar* lhs0 = lhs + k*lhsStride;
if (Vectorizable)
{
/* explicit vectorization */
// process first unaligned result's coeffs
for (Index j=0; j<alignedStart; ++j)
res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
// process aligned result's coeffs
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
else
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
}
// process remaining scalars (or all if no explicit vectorization)
for (Index i=alignedSize; i<size; ++i)
res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
}
if (skipColumns)
{
start = 0;
end = skipColumns;
skipColumns = 0;
}
else
break;
} while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}
};
/* Optimized row-major matrix * vector product:
* This algorithm processes 4 rows at once, which allows us both to reduce
* the number of load/stores of the result by a factor of 4 and to reduce
* the instruction dependency. Moreover, we know that all bands have the
* same alignment pattern.
*
* Mixing type logic:
* - alpha is always a complex (or converted to a complex)
* - no vectorization
*/
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
enum {
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
&& int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
ResScalar* res, Index resIncr,
ResScalar alpha)
{
EIGEN_UNUSED_VARIABLE(rhsIncr);
eigen_internal_assert(rhsIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
RhsPacket b = pload<RhsPacket>(&rhs[j]); \
ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
const Index rowsAtOnce = 4;
const Index peels = 2;
const Index RhsPacketAlignedMask = RhsPacketSize-1;
const Index LhsPacketAlignedMask = LhsPacketSize-1;
const Index depth = cols;
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume the data is at least aligned on the base scalar type;
// if that's not the case then vectorization is discarded, see below.
Index alignedStart = internal::first_aligned(rhs, depth);
Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
Index alignmentPattern = alignmentStep==0 ? AllAligned
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
// find how many rows we have to skip to be aligned with rhs (if possible)
Index skipRows = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
{
alignedSize = 0;
alignedStart = 0;
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
while (skipRows<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
++skipRows;
if (skipRows==LhsPacketSize)
{
// nothing can be aligned, no need to skip any row
alignmentPattern = NoneAligned;
skipRows = 0;
}
else
{
skipRows = (std::min)(skipRows,Index(rows));
// note that the skipped rows are processed later.
}
eigen_internal_assert( alignmentPattern==NoneAligned
|| LhsPacketSize==1
|| (skipRows + rowsAtOnce >= rows)
|| LhsPacketSize > depth
|| (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
}
else if(Vectorizable)
{
alignedStart = 0;
alignedSize = depth;
alignmentPattern = AllAligned;
}
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
{
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generate good binary code
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
if (Vectorizable)
{
/* explicit vectorization */
ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
// process initial unaligned coeffs
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
{
RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
}
if (alignedSize>alignedStart)
{
switch(alignmentPattern)
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
break;
case FirstAligned:
{
Index j = alignedStart;
if (peels>1)
{
/* Here we process 4 rows with two peeled iterations to hide
 * the overhead of unaligned loads. Moreover, unaligned loads are handled
 * using special shift/move operations between the two aligned packets
 * overlapping the desired unaligned packet. This is *much* more efficient
* than basic unaligned loads.
*/
LhsPacket A01, A02, A03, A11, A12, A13;
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
for (; j<peeledSize; j+=peels*RhsPacketSize)
{
RhsPacket b = pload<RhsPacket>(&rhs[j]);
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
ptmp1 = pcj.pmadd(A01, b, ptmp1);
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
ptmp2 = pcj.pmadd(A02, b, ptmp2);
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
ptmp3 = pcj.pmadd(A03, b, ptmp3);
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
ptmp1 = pcj.pmadd(A11, b, ptmp1);
ptmp2 = pcj.pmadd(A12, b, ptmp2);
ptmp3 = pcj.pmadd(A13, b, ptmp3);
}
}
for (; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
break;
}
tmp0 += predux(ptmp0);
tmp1 += predux(ptmp1);
tmp2 += predux(ptmp2);
tmp3 += predux(ptmp3);
}
} // end explicit vectorization
// process remaining coeffs (or all if no explicit vectorization)
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
{
RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
}
res[i*resIncr] += alpha*tmp0;
res[(i+offset1)*resIncr] += alpha*tmp1;
res[(i+2)*resIncr] += alpha*tmp2;
res[(i+offset3)*resIncr] += alpha*tmp3;
}
// process remaining first and last rows (at most rowsAtOnce-1)
Index end = rows;
Index start = rowBound;
do
{
for (Index i=start; i<end; ++i)
{
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride;
// process first unaligned result's coeffs
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
tmp0 += cj.pmul(lhs0[j], rhs[j]);
if (alignedSize>alignedStart)
{
// process aligned rhs coeffs
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
else
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
tmp0 += predux(ptmp0);
}
// process remaining scalars
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
tmp0 += cj.pmul(lhs0[j], rhs[j]);
res[i*resIncr] += alpha*tmp0;
}
if (skipRows)
{
start = 0;
end = skipRows;
skipRows = 0;
}
else
break;
} while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
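// A minimal scalar reference sketch of what the two kernels above compute,
// namely res += alpha * lhs * rhs for a column-major and a row-major lhs,
// ignoring the ConjugateLhs/ConjugateRhs flags. Illustration only, not part
// of Eigen; the names gemv_colmajor_ref and gemv_rowmajor_ref are hypothetical.
template<typename Scalar, typename Index>
void gemv_colmajor_ref(Index rows, Index cols,
                       const Scalar* lhs, Index lhsStride,
                       const Scalar* rhs, Index rhsIncr,
                       Scalar* res, Scalar alpha)
{
  // Column-major: each column of lhs is scaled by alpha*rhs[j] and accumulated
  // into res; the optimized kernel above does this four columns at a time.
  for (Index j=0; j<cols; ++j)
  {
    const Scalar s = alpha * rhs[j*rhsIncr];
    for (Index i=0; i<rows; ++i)
      res[i] += s * lhs[i + j*lhsStride];
  }
}
template<typename Scalar, typename Index>
void gemv_rowmajor_ref(Index rows, Index cols,
                       const Scalar* lhs, Index lhsStride,
                       const Scalar* rhs,
                       Scalar* res, Index resIncr, Scalar alpha)
{
  // Row-major: each result coefficient is alpha times the dot product of one
  // row of lhs with rhs; the optimized kernel above does four rows at a time.
  for (Index i=0; i<rows; ++i)
  {
    Scalar tmp(0);
    for (Index j=0; j<cols; ++j)
      tmp += lhs[j + i*lhsStride] * rhs[j];
    res[i*resIncr] += alpha * tmp;
  }
}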
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* General matrix-vector product functionality based on ?GEMV.
********************************************************************************
*/
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
namespace Eigen {
namespace internal {
/**********************************************************************
* This file implements general matrix-vector multiplication using the
* BLAS ?gemv routine via partial specialization of the
* general_matrix_vector_product::run(..) method for float, double,
* std::complex<float> and std::complex<double> types
**********************************************************************/
// gemv specialization
template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
struct general_matrix_vector_product_gemv :
general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsIncr, \
Scalar* res, Index resIncr, Scalar alpha) \
{ \
if (ConjugateLhs) { \
general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
} else { \
general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
} \
} \
}; \
template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsIncr, \
Scalar* res, Index resIncr, Scalar alpha) \
{ \
general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
} \
};
EIGEN_MKL_GEMV_SPECIALIZE(double)
EIGEN_MKL_GEMV_SPECIALIZE(float)
EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* lhs, Index lhsStride, \
const EIGTYPE* rhs, Index rhsIncr, \
EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
{ \
MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
MKLTYPE alpha_, beta_; \
const EIGTYPE *x_ptr, myone(1); \
char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
if (LhsStorageOrder==RowMajor) { \
m=cols; \
n=rows; \
}\
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
GEMVVector x_tmp; \
if (ConjugateRhs) { \
Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
x_tmp=map_x.conjugate(); \
x_ptr=x_tmp.data(); \
incx=1; \
} else x_ptr=rhs; \
MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
}\
};
EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d)
EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s)
EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c)
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
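// A hedged usage sketch: assuming Eigen is built and linked against Intel MKL,
// defining EIGEN_USE_MKL_ALL before including Eigen enables the Specialized
// paths above, so a plain product such as A*x is expected to go through ?gemv.
// The sizes below are arbitrary and only for illustration.
#define EIGEN_USE_MKL_ALL
#include <Eigen/Dense>
int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
  Eigen::VectorXd x = Eigen::VectorXd::Random(512);
  Eigen::VectorXd y = A * x;   // expected to dispatch to the dgemv specialization above
  return y.size() == 512 ? 0 : 1;
}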
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H
namespace Eigen {
namespace internal {
/** \internal */
inline void manage_multi_threading(Action action, int* v)
{
static EIGEN_UNUSED int m_maxThreads = -1;
if(action==SetAction)
{
eigen_internal_assert(v!=0);
m_maxThreads = *v;
}
else if(action==GetAction)
{
eigen_internal_assert(v!=0);
#ifdef EIGEN_HAS_OPENMP
if(m_maxThreads>0)
*v = m_maxThreads;
else
*v = omp_get_max_threads();
#else
*v = 1;
#endif
}
else
{
eigen_internal_assert(false);
}
}
}
/** Must be called first when calling Eigen from multiple threads */
inline void initParallel()
{
int nbt;
internal::manage_multi_threading(GetAction, &nbt);
std::ptrdiff_t l1, l2;
internal::manage_caching_sizes(GetAction, &l1, &l2);
}
/** \returns the max number of threads reserved for Eigen
* \sa setNbThreads */
inline int nbThreads()
{
int ret;
internal::manage_multi_threading(GetAction, &ret);
return ret;
}
/** Sets the max number of threads reserved for Eigen
* \sa nbThreads */
inline void setNbThreads(int v)
{
internal::manage_multi_threading(SetAction, &v);
}
namespace internal {
template<typename Index> struct GemmParallelInfo
{
GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
int volatile sync;
int volatile users;
Index rhs_start;
Index rhs_length;
};
template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
{
// TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types
#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
// FIXME the transpose variable is only needed to properly split
// the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole
// parallelizer mechanism has to be redesigned anyway.
EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols);
#else
// Dynamically check whether we should enable or disable OpenMP.
// The conditions are:
// - the max number of threads we can create is greater than 1
// - we are not already in a parallel code
// - the sizes are large enough
// 1- are we already in a parallel session?
// FIXME omp_get_num_threads()>1 only works with OpenMP; what if the user does not use OpenMP?
if((!Condition) || (omp_get_num_threads()>1))
return func(0,rows, 0,cols);
Index size = transpose ? cols : rows;
// 2- compute the maximal number of threads from the size of the product:
// FIXME this has to be fine tuned
Index max_threads = std::max<Index>(1,size / 32);
// 3 - compute the number of threads we are going to use
Index threads = std::min<Index>(nbThreads(), max_threads);
if(threads==1)
return func(0,rows, 0,cols);
Eigen::initParallel();
func.initParallelSession();
if(transpose)
std::swap(rows,cols);
Index blockCols = (cols / threads) & ~Index(0x3);
Index blockRows = (rows / threads) & ~Index(0x7);
GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
#pragma omp parallel for schedule(static,1) num_threads(threads)
for(Index i=0; i<threads; ++i)
{
Index r0 = i*blockRows;
Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
Index c0 = i*blockCols;
Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
info[i].rhs_start = c0;
info[i].rhs_length = actualBlockCols;
if(transpose)
func(0, cols, r0, actualBlockRows, info);
else
func(r0, actualBlockRows, 0,cols, info);
}
delete[] info;
#endif
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PARALLELIZER_H
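// A short usage sketch of the public threading API defined above, assuming an
// OpenMP-enabled build (without OpenMP, nbThreads() simply returns 1). The
// matrix sizes are arbitrary and only for illustration.
#include <Eigen/Dense>
#include <iostream>
int main()
{
  Eigen::initParallel();       // call once before using Eigen from several threads
  Eigen::setNbThreads(4);      // cap the number of threads Eigen may use for products
  std::cout << "Eigen will use up to " << Eigen::nbThreads() << " threads\n";
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(1024, 1024);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(1024, 1024);
  Eigen::MatrixXd C = A * B;   // large products are split across threads by parallelize_gemm
  return C.rows() == 1024 ? 0 : 1;
}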
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
namespace Eigen {
namespace internal {
// pack a selfadjoint block diagonal for use with the gebp_kernel
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
struct symm_pack_lhs
{
template<int BlockRows> inline
void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
{
// normal copy
for(Index k=0; k<i; k++)
for(Index w=0; w<BlockRows; w++)
blockA[count++] = lhs(i+w,k); // normal
// symmetric copy
Index h = 0;
for(Index k=i; k<i+BlockRows; k++)
{
for(Index w=0; w<h; w++)
blockA[count++] = conj(lhs(k, i+w)); // transposed
blockA[count++] = real(lhs(k,k)); // real (diagonal)
for(Index w=h+1; w<BlockRows; w++)
blockA[count++] = lhs(i+w, k); // normal
++h;
}
// transposed copy
for(Index k=i+BlockRows; k<cols; k++)
for(Index w=0; w<BlockRows; w++)
blockA[count++] = conj(lhs(k, i+w)); // transposed
}
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
Index peeled_mc = (rows/Pack1)*Pack1;
for(Index i=0; i<peeled_mc; i+=Pack1)
{
pack<Pack1>(blockA, lhs, cols, i, count);
}
if(rows-peeled_mc>=Pack2)
{
pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
peeled_mc += Pack2;
}
// do the same with mr==1
for(Index i=peeled_mc; i<rows; i++)
{
for(Index k=0; k<i; k++)
blockA[count++] = lhs(i, k); // normal
blockA[count++] = real(lhs(i, i)); // real (diagonal)
for(Index k=i+1; k<cols; k++)
blockA[count++] = conj(lhs(k, i)); // transposed
}
}
};
template<typename Scalar, typename Index, int nr, int StorageOrder>
struct symm_pack_rhs
{
enum { PacketSize = packet_traits<Scalar>::size };
void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
{
Index end_k = k2 + rows;
Index count = 0;
const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
Index packet_cols = (cols/nr)*nr;
// first part: normal case
for(Index j2=0; j2<k2; j2+=nr)
{
for(Index k=k2; k<end_k; k++)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
if (nr==4)
{
blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3);
}
count += nr;
}
}
// second part: diagonal block
for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
{
// again we can split vertically into three different parts (transpose, symmetric, normal)
// transpose
for(Index k=k2; k<j2; k++)
{
blockB[count+0] = conj(rhs(j2+0,k));
blockB[count+1] = conj(rhs(j2+1,k));
if (nr==4)
{
blockB[count+2] = conj(rhs(j2+2,k));
blockB[count+3] = conj(rhs(j2+3,k));
}
count += nr;
}
// symmetric
Index h = 0;
for(Index k=j2; k<j2+nr; k++)
{
// normal
for (Index w=0 ; w<h; ++w)
blockB[count+w] = rhs(k,j2+w);
blockB[count+h] = real(rhs(k,k));
// transpose
for (Index w=h+1 ; w<nr; ++w)
blockB[count+w] = conj(rhs(j2+w,k));
count += nr;
++h;
}
// normal
for(Index k=j2+nr; k<end_k; k++)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
if (nr==4)
{
blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3);
}
count += nr;
}
}
// third part: transposed
for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
{
for(Index k=k2; k<end_k; k++)
{
blockB[count+0] = conj(rhs(j2+0,k));
blockB[count+1] = conj(rhs(j2+1,k));
if (nr==4)
{
blockB[count+2] = conj(rhs(j2+2,k));
blockB[count+3] = conj(rhs(j2+3,k));
}
count += nr;
}
}
// copy the remaining columns one at a time (=> the same with nr==1)
for(Index j2=packet_cols; j2<cols; ++j2)
{
// transpose
Index half = (std::min)(end_k,j2);
for(Index k=k2; k<half; k++)
{
blockB[count] = conj(rhs(j2,k));
count += 1;
}
if(half==j2 && half<k2+rows)
{
blockB[count] = real(rhs(j2,j2));
count += 1;
}
else
half--;
// normal
for(Index k=half+1; k<k2+rows; k++)
{
blockB[count] = rhs(k,j2);
count += 1;
}
}
}
};
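// A minimal sketch of the mirroring rule the packing routines above rely on:
// for a selfadjoint matrix stored in its lower triangle (column-major), any
// coefficient (i,j) is read either directly or as the conjugate of its mirror
// across the diagonal. The helper name selfadjoint_coeff is hypothetical and
// uses std::complex/std::conj, which Eigen already pulls in.
template<typename Real, typename Index>
std::complex<Real> selfadjoint_coeff(const std::complex<Real>* lower,
                                     Index stride, Index i, Index j)
{
  // i>=j lives in the stored lower triangle; i<j is recovered by conjugation.
  return (i >= j) ? lower[i + j*stride] : std::conj(lower[j + i*stride]);
}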
/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
* the general matrix matrix product.
*/
template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
int ResStorageOrder>
struct product_selfadjoint_matrix;
template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
{
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar alpha)
{
product_selfadjoint_matrix<Scalar, Index,
EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
ColMajor>
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha);
}
};
template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
{
static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar alpha)
{
Index size = rows;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
// kc must be smaller than mc
kc = (std::min)(kc,mc);
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB + sizeW;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
const Index actual_kc = (std::min)(k2+kc,size)-k2;
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
// the selected lhs panel has to be split into three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
// 2 - the diagonal block => special packed copy
// 3 - the panel below the diagonal block => generic packed copy
for(Index i2=0; i2<k2; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,k2)-i2;
// transposed packed copy
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
// the block diagonal
{
const Index actual_mc = (std::min)(k2+kc,size)-k2;
// symmetric packed copy
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
};
// matrix * selfadjoint product
template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
{
static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar alpha)
{
Index size = cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB + sizeW;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
{
const Index actual_kc = (std::min)(k2+kc,size)-k2;
pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
// => GEPP
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
};
} // end namespace internal
/***************************************************************************
* Wrapper to product_selfadjoint_matrix
***************************************************************************/
namespace internal {
template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
: traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}
template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
: public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
{
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
enum {
LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
};
template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const
{
eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
internal::product_selfadjoint_matrix<Scalar, Index,
EIGEN_LOGICAL_XOR(LhsIsUpper,
internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
EIGEN_LOGICAL_XOR(RhsIsUpper,
internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
::run(
lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha // alpha
);
}
};
} // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
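// A short usage sketch of the SelfadjointProductMatrix wrapper above: the
// selfadjointView() expression routes the product through
// product_selfadjoint_matrix, so only the referenced triangle of A is read.
// Sizes are arbitrary and only for illustration.
#include <Eigen/Dense>
int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);   // only its lower triangle is used
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 8);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(64, 8);
  C.noalias() += A.selfadjointView<Eigen::Lower>() * B;  // SYMM-like product
  return 0;
}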
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
********************************************************************************
*/
#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
namespace Eigen {
namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
{\
\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='L', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
EIGTYPE myone(1);\
\
/* Set transpose options */ \
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)lhsStride; \
ldb = (MKL_INT)rhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (LhsStorageOrder==RowMajor) uplo='U'; \
a = _lhs; \
\
if (RhsStorageOrder==RowMajor) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = rhs.adjoint(); \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} else b = _rhs; \
\
MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
{\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='L', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
EIGTYPE myone(1); \
\
/* Set transpose options */ \
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)lhsStride; \
ldb = (MKL_INT)rhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
a_tmp = lhs.conjugate(); \
a = a_tmp.data(); \
lda = a_tmp.outerStride(); \
} else a = _lhs; \
if (LhsStorageOrder==RowMajor) uplo='U'; \
\
if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
b = _rhs; } \
else { \
if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
b_tmp = rhs.conjugate(); \
} else \
if (ConjugateRhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = rhs.adjoint(); \
} else { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = rhs.transpose(); \
} \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} \
\
MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
\
} \
};
EIGEN_MKL_SYMM_L(double, double, d, d)
EIGEN_MKL_SYMM_L(float, float, f, s)
EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
{\
\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='R', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
EIGTYPE myone(1);\
\
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)rhsStride; \
ldb = (MKL_INT)lhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (RhsStorageOrder==RowMajor) uplo='U'; \
a = _rhs; \
\
if (LhsStorageOrder==RowMajor) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = lhs.adjoint(); \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} else b = _lhs; \
\
MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
{\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='R', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
EIGTYPE myone(1); \
\
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)rhsStride; \
ldb = (MKL_INT)lhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
a_tmp = rhs.conjugate(); \
a = a_tmp.data(); \
lda = a_tmp.outerStride(); \
} else a = _rhs; \
if (RhsStorageOrder==RowMajor) uplo='U'; \
\
if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
b = _lhs; } \
else { \
if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
b_tmp = lhs.conjugate(); \
} else \
if (ConjugateLhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
b_tmp = lhs.adjoint(); \
} else { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
b_tmp = lhs.transpose(); \
} \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} \
\
MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
} \
};
EIGEN_MKL_SYMM_R(double, double, d, d)
EIGEN_MKL_SYMM_R(float, float, f, s)
EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
namespace Eigen {
namespace internal {
/* Optimized selfadjoint matrix * vector product:
* This algorithm processes 2 columns at once, which allows us both to reduce
* the number of load/stores of the result by a factor of 2 and to reduce
* the instruction dependency.
*/
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
struct selfadjoint_matrix_vector_product;
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
struct selfadjoint_matrix_vector_product
{
static EIGEN_DONT_INLINE void run(
Index size,
const Scalar* lhs, Index lhsStride,
const Scalar* _rhs, Index rhsIncr,
Scalar* res,
Scalar alpha)
{
typedef typename packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
enum {
IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
IsLower = UpLo == Lower ? 1 : 0,
FirstTriangular = IsRowMajor == IsLower
};
conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
Scalar cjAlpha = ConjugateRhs ? conj(alpha) : alpha;
// FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
// if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
// this is because we need to extract packets
ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);
if (rhsIncr!=1)
{
const Scalar* it = _rhs;
for (Index i=0; i<size; ++i, it+=rhsIncr)
rhs[i] = *it;
}
Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
if (FirstTriangular)
bound = size - bound;
for (Index j=FirstTriangular ? bound : 0;
j<(FirstTriangular ? size : bound);j+=2)
{
register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
Scalar t0 = cjAlpha * rhs[j];
Packet ptmp0 = pset1<Packet>(t0);
Scalar t1 = cjAlpha * rhs[j+1];
Packet ptmp1 = pset1<Packet>(t1);
Scalar t2(0);
Packet ptmp2 = pset1<Packet>(t2);
Scalar t3(0);
Packet ptmp3 = pset1<Packet>(t3);
size_t starti = FirstTriangular ? 0 : j+2;
size_t endi = FirstTriangular ? j : size;
size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
// TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
res[j] += cjd.pmul(internal::real(A0[j]), t0);
res[j+1] += cjd.pmul(internal::real(A1[j+1]), t1);
if(FirstTriangular)
{
res[j] += cj0.pmul(A1[j], t1);
t3 += cj1.pmul(A1[j], rhs[j]);
}
else
{
res[j+1] += cj0.pmul(A0[j+1],t0);
t2 += cj1.pmul(A0[j+1], rhs[j+1]);
}
for (size_t i=starti; i<alignedStart; ++i)
{
res[i] += t0 * A0[i] + t1 * A1[i];
t2 += conj(A0[i]) * rhs[i];
t3 += conj(A1[i]) * rhs[i];
}
// Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed up)
// gcc 4.2 does this optimization automatically.
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
{
Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize;
Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize;
Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
Packet Xi = pload <Packet>(resIt);
Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
pstore(resIt,Xi); resIt += PacketSize;
}
for (size_t i=alignedEnd; i<endi; i++)
{
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
t2 += cj1.pmul(A0[i], rhs[i]);
t3 += cj1.pmul(A1[i], rhs[i]);
}
res[j] += alpha * (t2 + predux(ptmp2));
res[j+1] += alpha * (t3 + predux(ptmp3));
}
for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
{
register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
Scalar t1 = cjAlpha * rhs[j];
Scalar t2(0);
// TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
res[j] += cjd.pmul(internal::real(A0[j]), t1);
for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
{
res[i] += cj0.pmul(A0[i], t1);
t2 += cj1.pmul(A0[i], rhs[i]);
}
res[j] += alpha * t2;
}
}
};
} // end namespace internal
/***************************************************************************
* Wrapper to product_selfadjoint_vector
***************************************************************************/
namespace internal {
template<typename Lhs, int LhsMode, typename Rhs>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
: traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
{};
}
template<typename Lhs, int LhsMode, typename Rhs>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
: public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
{
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
enum {
LhsUpLo = LhsMode&(Upper|Lower)
};
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const
{
typedef typename Dest::Scalar ResScalar;
typedef typename Base::RhsScalar RhsScalar;
typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
enum {
EvalToDest = (Dest::InnerStrideAtCompileTime==1),
UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
};
internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
EvalToDest ? dest.data() : static_dest.data());
ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
if(!EvalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
int size = dest.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
MappedDest(actualDestPtr, dest.size()) = dest;
}
if(!UseRhs)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
int size = rhs.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
}
internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
(
lhs.rows(), // size
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
actualRhsPtr, 1, // rhs info
actualDestPtr, // result info
actualAlpha // scale factor
);
if(!EvalToDest)
dest = MappedDest(actualDestPtr, dest.size());
}
};
namespace internal {
template<typename Lhs, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
: traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}
template<typename Lhs, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
: public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
{
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
enum {
RhsUpLo = RhsMode&(Upper|Lower)
};
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const
{
// let's simply transpose the product
Transpose<Dest> destT(dest);
SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
}
};
} // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
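// A short usage sketch of the selfadjoint matrix * vector wrapper above,
// assuming only the upper triangle of A holds meaningful data (the kernel
// takes the real part of the diagonal). Sizes are arbitrary and only for
// illustration.
#include <Eigen/Dense>
int main()
{
  Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(128, 128); // upper triangle referenced
  Eigen::VectorXcd x = Eigen::VectorXcd::Random(128);
  Eigen::VectorXcd y = Eigen::VectorXcd::Zero(128);
  y.noalias() += A.selfadjointView<Eigen::Upper>() * x;    // HEMV-like product
  return 0;
}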