OpenDAS / pydensecrf

Commit da2c1226
authored Nov 13, 2015 by lucasb-eyer

    Initial import of code.

parent 3ae01ffb
Changes: 177
Showing 20 changed files with 6650 additions and 0 deletions (+6650 -0)
densecrf/include/Eigen/src/Core/arch/NEON/CMakeLists.txt                      +6     -0
densecrf/include/Eigen/src/Core/arch/NEON/Complex.h                           +259   -0
densecrf/include/Eigen/src/Core/arch/NEON/PacketMath.h                        +424   -0
densecrf/include/Eigen/src/Core/arch/SSE/CMakeLists.txt                       +6     -0
densecrf/include/Eigen/src/Core/arch/SSE/Complex.h                            +436   -0
densecrf/include/Eigen/src/Core/arch/SSE/MathFunctions.h                      +388   -0
densecrf/include/Eigen/src/Core/arch/SSE/PacketMath.h                         +632   -0
densecrf/include/Eigen/src/Core/products/CMakeLists.txt                       +6     -0
densecrf/include/Eigen/src/Core/products/CoeffBasedProduct.h                  +441   -0
densecrf/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h            +1319  -0
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrix.h                +428   -0
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h      +214   -0
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h  +146   -0
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h            +118   -0
densecrf/include/Eigen/src/Core/products/GeneralMatrixVector.h                +552   -0
densecrf/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h            +131   -0
densecrf/include/Eigen/src/Core/products/Parallelizer.h                       +159   -0
densecrf/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h            +416   -0
densecrf/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h        +295   -0
densecrf/include/Eigen/src/Core/products/SelfadjointMatrixVector.h            +274   -0
Too many changes to show. To preserve performance, only 177 of 177+ files are displayed.
densecrf/include/Eigen/src/Core/arch/NEON/CMakeLists.txt  0 → 100644

FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_arch_NEON_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
)
densecrf/include/Eigen/src/Core/arch/NEON/Complex.h  0 → 100644

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_COMPLEX_NEON_H
#define EIGEN_COMPLEX_NEON_H

namespace Eigen {

namespace internal {

static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);

//---------- float ----------
struct Packet2cf
{
  EIGEN_STRONG_INLINE Packet2cf() {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
  Packet4f v;
};

template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
  typedef Packet2cf type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,

    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasSetLinear = 0
  };
};

template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };

template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
  float32x2_t r64;
  r64 = vld1_f32((float *)&from);

  return Packet2cf(vcombine_f32(r64, r64));
}

template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
{
  Packet4ui b = vreinterpretq_u32_f32(a.v);
  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
}

template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  Packet4f v1, v2;
  float32x2_t a_lo, a_hi;

  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
  v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
  v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
  // Multiply the real a with b
  v1 = vmulq_f32(v1, b.v);
  // Multiply the imag a with b
  v2 = vmulq_f32(v2, b.v);
  // Conjugate v2
  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
  // Swap real/imag elements in v2.
  a_lo = vrev64_f32(vget_low_f32(v2));
  a_hi = vrev64_f32(vget_high_f32(v2));
  v2 = vcombine_f32(a_lo, a_hi);
  // Add and return the result
  return Packet2cf(vaddq_f32(v1, v2));
}

template<> EIGEN_STRONG_INLINE Packet2cf pand  <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf por   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf pxor  <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
}

template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }

template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }

template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr) { __pld((float *)addr); }

template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
  std::complex<float> EIGEN_ALIGN16 x[2];
  vst1q_f32((float *)x, a.v);
  return x[0];
}

template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
  float32x2_t a_lo, a_hi;
  Packet4f a_r128;

  a_lo = vget_low_f32(a.v);
  a_hi = vget_high_f32(a.v);
  a_r128 = vcombine_f32(a_hi, a_lo);

  return Packet2cf(a_r128);
}

template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
{
  return Packet2cf(vrev64q_f32(a.v));
}

template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
  float32x2_t a1, a2;
  std::complex<float> s;

  a1 = vget_low_f32(a.v);
  a2 = vget_high_f32(a.v);
  a2 = vadd_f32(a1, a2);
  vst1_f32((float *)&s, a2);

  return s;
}

template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
  Packet4f sum1, sum2, sum;

  // Add the first two 64-bit float32x2_t of vecs[0]
  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
  sum = vaddq_f32(sum1, sum2);

  return Packet2cf(sum);
}

template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
{
  float32x2_t a1, a2, v1, v2, prod;
  std::complex<float> s;

  a1 = vget_low_f32(a.v);
  a2 = vget_high_f32(a.v);
  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
  v1 = vdup_lane_f32(a1, 0);
  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
  v2 = vdup_lane_f32(a1, 1);
  // Multiply the real a with b
  v1 = vmul_f32(v1, a2);
  // Multiply the imag a with b
  v2 = vmul_f32(v2, a2);
  // Conjugate v2
  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
  // Swap real/imag elements in v2.
  v2 = vrev64_f32(v2);
  // Add v1, v2
  prod = vadd_f32(v1, v2);

  vst1_f32((float *)&s, prod);

  return s;
}

template<int Offset>
struct palign_impl<Offset,Packet2cf>
{
  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
  {
    if (Offset==1)
    {
      first.v = vextq_f32(first.v, second.v, 2);
    }
  }
};

template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    return internal::pmul(a, pconj(b));
  }
};

template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    return internal::pmul(pconj(a), b);
  }
};

template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    return pconj(internal::pmul(a, b));
  }
};

template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  // TODO optimize it for AltiVec
  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
  Packet4f s, rev_s;
  float32x2_t a_lo, a_hi;

  // this computes the norm
  s = vmulq_f32(b.v, b.v);
  a_lo = vrev64_f32(vget_low_f32(s));
  a_hi = vrev64_f32(vget_high_f32(s));
  rev_s = vcombine_f32(a_lo, a_hi);

  return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s)));
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_COMPLEX_NEON_H
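The Packet2cf kernels above process two std::complex<float> values per NEON register, and the lane shuffles can obscure the underlying arithmetic. The following is a minimal scalar reading aid, not part of the commit: it spells out what pmul and pdiv compute per lane, assuming the usual complex identities; the helper names scalar_pmul and scalar_pdiv are hypothetical.

// Editorial sketch: scalar equivalent of the per-lane arithmetic vectorized
// by pmul<Packet2cf> and pdiv<Packet2cf> above.
#include <complex>

static std::complex<float> scalar_pmul(std::complex<float> a, std::complex<float> b)
{
  // pmul builds v1 = a_re*b and v2 = a_im*b, conjugates and swaps v2, then adds:
  // result = (a_re*b_re - a_im*b_im) + i*(a_re*b_im + a_im*b_re)
  return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(),
                             a.real()*b.imag() + a.imag()*b.real());
}

static std::complex<float> scalar_pdiv(std::complex<float> a, std::complex<float> b)
{
  // pdiv computes a*conj(b) / |b|^2; the NEON code obtains |b|^2 in both lanes
  // by adding b.v*b.v to its real/imag-swapped copy (s + rev_s).
  float norm = b.real()*b.real() + b.imag()*b.imag();
  std::complex<float> num = scalar_pmul(a, std::conj(b));
  return std::complex<float>(num.real()/norm, num.imag()/norm);
}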
densecrf/include/Eigen/src/Core/arch/NEON/PacketMath.h  0 → 100644

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
// Heavily based on Gael's SSE version.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_NEON_H
#define EIGEN_PACKET_MATH_NEON_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#endif

typedef float32x4_t Packet4f;
typedef int32x4_t   Packet4i;
typedef uint32x4_t  Packet4ui;

#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

#if defined(__llvm__) && !defined(__clang__)
  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
#else
  //Default initializer for packets
  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
#endif

#ifndef __pld
#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
#endif

template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 0,
    HasSqrt = 0
  };
};
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4
    // FIXME check the Has*
  };
};

#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
// workaround gcc 4.2, 4.3 and 4.4 compilatin issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to, from); }
EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to, from); }
#endif

template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int   type; enum {size=4}; };

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&   from) { return vdupq_n_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
{
  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
  return vaddq_f32(pset1<Packet4f>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
{
  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
  return vaddq_s32(pset1<Packet4i>(a), countdown);
}

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f inv, restep, div;

  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
  // Newton-Raphson and vrecpsq_f32()
  inv = vrecpeq_f32(b);

  // This returns a differential, by which we will have to multiply inv to get a better
  // approximation of 1/b.
  restep = vrecpsq_f32(b, inv);
  inv = vmulq_f32(restep, inv);

  // Finally, multiply a by 1/b and get the wanted result of the division.
  div = vmulq_f32(a, inv);

  return div;
}
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{
  eigen_assert(false && "packet integer division are not supported by NEON");
  return pset1<Packet4i>(0);
}

// for some weird raisons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }

// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  float32x2_t lo, hi;
  lo = vdup_n_f32(*from);
  hi = vdup_n_f32(*(from+1));
  return vcombine_f32(lo, hi);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  int32x2_t lo, hi;
  lo = vdup_n_s32(*from);
  hi = vdup_n_s32(*(from+1));
  return vcombine_s32(lo, hi);
}

template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int*     to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*     to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { __pld(addr); }

// FIXME only store the 2 first elements ?
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int   pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
  float32x2_t a_lo, a_hi;
  Packet4f a_r64;

  a_r64 = vrev64q_f32(a);
  a_lo = vget_low_f32(a_r64);
  a_hi = vget_high_f32(a_r64);
  return vcombine_f32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{
  int32x2_t a_lo, a_hi;
  Packet4i a_r64;

  a_r64 = vrev64q_s32(a);
  a_lo = vget_low_s32(a_r64);
  a_hi = vget_high_s32(a_r64);
  return vcombine_s32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }

template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, sum;
  float s[2];

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  sum = vpadd_f32(a_lo, a_hi);
  sum = vpadd_f32(sum, sum);
  vst1_f32(s, sum);

  return s[0];
}

template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  float32x4x2_t vtrn1, vtrn2, res1, res2;
  Packet4f sum1, sum2, sum;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);

  // Do the addition of the resulting vectors
  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
  sum = vaddq_f32(sum1, sum2);

  return sum;
}

template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, sum;
  int32_t s[2];

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  sum = vpadd_s32(a_lo, a_hi);
  sum = vpadd_s32(sum, sum);
  vst1_s32(s, sum);

  return s[0];
}

template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  int32x4x2_t vtrn1, vtrn2, res1, res2;
  Packet4i sum1, sum2, sum;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);

  // Do the addition of the resulting vectors
  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
  sum = vaddq_s32(sum1, sum2);

  return sum;
}

// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, prod;
  float s[2];

  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
  prod = vmul_f32(a_lo, a_hi);
  // Multiply prod with its swapped value |a2*a4|a1*a3|
  prod = vmul_f32(prod, vrev64_f32(prod));
  vst1_f32(s, prod);

  return s[0];
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, prod;
  int32_t s[2];

  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
  prod = vmul_s32(a_lo, a_hi);
  // Multiply prod with its swapped value |a2*a4|a1*a3|
  prod = vmul_s32(prod, vrev64_s32(prod));
  vst1_s32(s, prod);

  return s[0];
}

// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, min;
  float s[2];

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  min = vpmin_f32(a_lo, a_hi);
  min = vpmin_f32(min, min);
  vst1_f32(s, min);

  return s[0];
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, min;
  int32_t s[2];

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  min = vpmin_s32(a_lo, a_hi);
  min = vpmin_s32(min, min);
  vst1_s32(s, min);

  return s[0];
}

// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, max;
  float s[2];

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  max = vpmax_f32(a_lo, a_hi);
  max = vpmax_f32(max, max);
  vst1_f32(s, max);

  return s[0];
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, max;
  int32_t s[2];

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  max = vpmax_s32(a_lo, a_hi);
  max = vpmax_s32(max, max);
  vst1_s32(s, max);

  return s[0];
}

// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
    {\
        if (Offset!=0)\
            first = Command(first, second, Offset);\
    }\
};\

PALIGN_NEON(0,Packet4f,vextq_f32)
PALIGN_NEON(1,Packet4f,vextq_f32)
PALIGN_NEON(2,Packet4f,vextq_f32)
PALIGN_NEON(3,Packet4f,vextq_f32)
PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)

#undef PALIGN_NEON

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_NEON_H
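Because ARMv7 NEON has no vector divide, pdiv<Packet4f> above combines vrecpeq_f32 (a rough estimate of 1/b) with one vrecpsq_f32 Newton-Raphson step, vrecpsq_f32(b, x) returning 2 - b*x. As a reading aid, not part of the commit, here is the same refinement written out for one scalar lane; the function name approx_div and the sample values are hypothetical.

// Editorial sketch: one Newton-Raphson reciprocal step, as used by pdiv<Packet4f>.
#include <cstdio>

static float approx_div(float a, float b, float rough_estimate)
{
  float x = rough_estimate;     // plays the role of vrecpeq_f32(b)
  float restep = 2.0f - b * x;  // plays the role of vrecpsq_f32(b, x)
  x = restep * x;               // refined 1/b (the commit applies a single step)
  return a * x;                 // a * (1/b), an approximation of a / b
}

int main()
{
  // e.g. 10/3 starting from a crude 0.3 estimate of 1/3 -> about 3.3
  std::printf("%f\n", approx_div(10.0f, 3.0f, 0.3f));
  return 0;
}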
densecrf/include/Eigen/src/Core/arch/SSE/CMakeLists.txt  0 → 100644

FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_arch_SSE_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
)
densecrf/include/Eigen/src/Core/arch/SSE/Complex.h  0 → 100644

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_COMPLEX_SSE_H
#define EIGEN_COMPLEX_SSE_H

namespace Eigen {

namespace internal {

//---------- float ----------
struct Packet2cf
{
  EIGEN_STRONG_INLINE Packet2cf() {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
  __m128 v;
};

template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
  typedef Packet2cf type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,

    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasSetLinear = 0
  };
};

template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };

template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
{
  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return Packet2cf(_mm_xor_ps(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
{
  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
  return Packet2cf(_mm_xor_ps(a.v,mask));
}

template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  // TODO optimize it for SSE3 and 4
  #ifdef EIGEN_VECTORIZE_SSE3
  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
                                 _mm_mul_ps(_mm_movehdup_ps(a.v),
                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
//   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
//                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
//                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
  #else
  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
  return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
                              _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
                                                    vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
  #endif
}

template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }

template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&real_ref(*from))); }

template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
  Packet2cf res;
#if EIGEN_GNUC_AT_MOST(4,2)
  // workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
#else
  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
#endif
  return Packet2cf(_mm_movelh_ps(res.v,res.v));
}

template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }

template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&real_ref(*to), from.v); }

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
  #if EIGEN_GNUC_AT_MOST(4,3)
  // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
  // This workaround also fix invalid code generation with gcc 4.3
  EIGEN_ALIGN16 std::complex<float> res[2];
  _mm_store_ps((float*)res, a.v);
  return res[0];
  #else
  std::complex<float> res;
  _mm_storel_pi((__m64*)&res, a.v);
  return res;
  #endif
}

template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }

template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
}

template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
}

template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
{
  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
}

template<int Offset>
struct palign_impl<Offset,Packet2cf>
{
  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
  {
    if (Offset==1)
    {
      first.v = _mm_movehl_ps(first.v, first.v);
      first.v = _mm_movelh_ps(first.v, second.v);
    }
  }
};

template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    #ifdef EIGEN_VECTORIZE_SSE3
    return internal::pmul(a, pconj(b));
    #else
    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
    return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
    #endif
  }
};

template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    #ifdef EIGEN_VECTORIZE_SSE3
    return internal::pmul(pconj(a), b);
    #else
    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
    return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
                                _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
                                                      vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
    #endif
  }
};

template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
  {
    #ifdef EIGEN_VECTORIZE_SSE3
    return pconj(internal::pmul(a, b));
    #else
    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
    return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
    #endif
  }
};

template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
};

template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
};

template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  // TODO optimize it for SSE3 and 4
  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
  __m128 s = _mm_mul_ps(b.v,b.v);
  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
}

EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
{
  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
}

//---------- double ----------
struct Packet1cd
{
  EIGEN_STRONG_INLINE Packet1cd() {}
  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
  __m128d v;
};

template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
  typedef Packet1cd type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 0,
    size = 1,

    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasSetLinear = 0
  };
};

template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };

template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
{
  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
  return Packet1cd(_mm_xor_pd(a.v,mask));
}

template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // TODO optimize it for SSE3 and 4
  #ifdef EIGEN_VECTORIZE_SSE3
  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                            vec2d_swizzle1(b.v, 1, 0))));
  #else
  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
  return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                                    vec2d_swizzle1(b.v, 1, 0)), mask)));
  #endif
}

template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }

// FIXME force unaligned load, this is a temporary fix
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }

// FIXME force unaligned store, this is a temporary fix
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
  EIGEN_ALIGN16 double res[2];
  _mm_store_pd(res, a.v);
  return std::complex<double>(res[0],res[1]);
}

template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }

template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
{
  return pfirst(a);
}

template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
{
  return vecs[0];
}

template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
{
  return pfirst(a);
}

template<int Offset>
struct palign_impl<Offset,Packet1cd>
{
  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
  {
    // FIXME is it sure we never have to align a Packet1cd?
    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
  }
};

template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    #ifdef EIGEN_VECTORIZE_SSE3
    return internal::pmul(a, pconj(b));
    #else
    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
    return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                           vec2d_swizzle1(b.v, 1, 0))));
    #endif
  }
};

template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    #ifdef EIGEN_VECTORIZE_SSE3
    return internal::pmul(pconj(a), b);
    #else
    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
    return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
                                _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                                      vec2d_swizzle1(b.v, 1, 0)), mask)));
    #endif
  }
};

template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    #ifdef EIGEN_VECTORIZE_SSE3
    return pconj(internal::pmul(a, b));
    #else
    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
    return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                           vec2d_swizzle1(b.v, 1, 0))));
    #endif
  }
};

template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
};

template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
};

template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // TODO optimize it for SSE3 and 4
  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
  __m128d s = _mm_mul_pd(b.v,b.v);
  return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
}

EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
  return Packet1cd(preverse(x.v));
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_COMPLEX_SSE_H
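Both the NEON and SSE files specialize conj_helper on two boolean flags that decide which operand is conjugated before the product; the division kernels then reuse the <false,true> variant. As a reading aid, not part of the commit, here is a scalar sketch of those four cases; the function name conj_product is hypothetical.

// Editorial sketch: scalar meaning of the conj_helper<Lhs,Rhs,ConjLhs,ConjRhs> flags.
#include <complex>

template<bool ConjLhs, bool ConjRhs>
static std::complex<float> conj_product(std::complex<float> a, std::complex<float> b)
{
  // <false,false> -> a * b
  // <false,true>  -> a * conj(b)            (used by pdiv above)
  // <true,false>  -> conj(a) * b
  // <true,true>   -> conj(a) * conj(b) == conj(a * b), matching pconj(pmul(a,b))
  if (ConjLhs) a = std::conj(a);
  if (ConjRhs) b = std::conj(b);
  return a * b;
}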
densecrf/include/Eigen/src/Core/arch/SSE/MathFunctions.h
0 → 100644
View file @
da2c1226
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_MATH_FUNCTIONS_SSE_H
namespace
Eigen
{
namespace
internal
{
template
<
>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet4f
plog
<
Packet4f
>
(
const
Packet4f
&
_x
)
{
Packet4f
x
=
_x
;
_EIGEN_DECLARE_CONST_Packet4f
(
1
,
1.0
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
half
,
0.5
f
);
_EIGEN_DECLARE_CONST_Packet4i
(
0x7f
,
0x7f
);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT
(
inv_mant_mask
,
~
0x7f800000
);
/* the smallest non denormalized float number */
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT
(
min_norm_pos
,
0x00800000
);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT
(
minus_inf
,
0xff800000
);
//-1.f/0.f);
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_SQRTHF
,
0.707106781186547524
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p0
,
7.0376836292E-2
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p1
,
-
1.1514610310E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p2
,
1.1676998740E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p3
,
-
1.2420140846E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p4
,
+
1.4249322787E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p5
,
-
1.6668057665E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p6
,
+
2.0000714765E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p7
,
-
2.4999993993E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p8
,
+
3.3333331174E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_q1
,
-
2.12194440e-4
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_q2
,
0.693359375
f
);
Packet4i
emm0
;
Packet4f
invalid_mask
=
_mm_cmplt_ps
(
x
,
_mm_setzero_ps
());
Packet4f
iszero_mask
=
_mm_cmpeq_ps
(
x
,
_mm_setzero_ps
());
x
=
pmax
(
x
,
p4f_min_norm_pos
);
/* cut off denormalized stuff */
emm0
=
_mm_srli_epi32
(
_mm_castps_si128
(
x
),
23
);
/* keep only the fractional part */
x
=
_mm_and_ps
(
x
,
p4f_inv_mant_mask
);
x
=
_mm_or_ps
(
x
,
p4f_half
);
emm0
=
_mm_sub_epi32
(
emm0
,
p4i_0x7f
);
Packet4f
e
=
padd
(
_mm_cvtepi32_ps
(
emm0
),
p4f_1
);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
Packet4f
mask
=
_mm_cmplt_ps
(
x
,
p4f_cephes_SQRTHF
);
Packet4f
tmp
=
_mm_and_ps
(
x
,
mask
);
x
=
psub
(
x
,
p4f_1
);
e
=
psub
(
e
,
_mm_and_ps
(
p4f_1
,
mask
));
x
=
padd
(
x
,
tmp
);
Packet4f
x2
=
pmul
(
x
,
x
);
Packet4f
x3
=
pmul
(
x2
,
x
);
Packet4f
y
,
y1
,
y2
;
y
=
pmadd
(
p4f_cephes_log_p0
,
x
,
p4f_cephes_log_p1
);
y1
=
pmadd
(
p4f_cephes_log_p3
,
x
,
p4f_cephes_log_p4
);
y2
=
pmadd
(
p4f_cephes_log_p6
,
x
,
p4f_cephes_log_p7
);
y
=
pmadd
(
y
,
x
,
p4f_cephes_log_p2
);
y1
=
pmadd
(
y1
,
x
,
p4f_cephes_log_p5
);
y2
=
pmadd
(
y2
,
x
,
p4f_cephes_log_p8
);
y
=
pmadd
(
y
,
x3
,
y1
);
y
=
pmadd
(
y
,
x3
,
y2
);
y
=
pmul
(
y
,
x3
);
y1
=
pmul
(
e
,
p4f_cephes_log_q1
);
tmp
=
pmul
(
x2
,
p4f_half
);
y
=
padd
(
y
,
y1
);
x
=
psub
(
x
,
tmp
);
y2
=
pmul
(
e
,
p4f_cephes_log_q2
);
x
=
padd
(
x
,
y
);
x
=
padd
(
x
,
y2
);
// negative arg will be NAN, 0 will be -INF
return
_mm_or_ps
(
_mm_andnot_ps
(
iszero_mask
,
_mm_or_ps
(
x
,
invalid_mask
)),
_mm_and_ps
(
iszero_mask
,
p4f_minus_inf
));
}
template
<
>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet4f
pexp
<
Packet4f
>
(
const
Packet4f
&
_x
)
{
Packet4f
x
=
_x
;
_EIGEN_DECLARE_CONST_Packet4f
(
1
,
1.0
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
half
,
0.5
f
);
_EIGEN_DECLARE_CONST_Packet4i
(
0x7f
,
0x7f
);
_EIGEN_DECLARE_CONST_Packet4f
(
exp_hi
,
88.3762626647950
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
exp_lo
,
-
88.3762626647949
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_LOG2EF
,
1.44269504088896341
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_C1
,
0.693359375
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_C2
,
-
2.12194440e-4
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p0
,
1.9875691500E-4
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p1
,
1.3981999507E-3
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p2
,
8.3334519073E-3
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p3
,
4.1665795894E-2
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p4
,
1.6666665459E-1
f
);
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p5
,
5.0000001201E-1
f
);
Packet4f
tmp
=
_mm_setzero_ps
(),
fx
;
Packet4i
emm0
;
// clamp x
x
=
pmax
(
pmin
(
x
,
p4f_exp_hi
),
p4f_exp_lo
);
/* express exp(x) as exp(g + n*log(2)) */
fx
=
pmadd
(
x
,
p4f_cephes_LOG2EF
,
p4f_half
);
/* how to perform a floorf with SSE: just below */
emm0
=
_mm_cvttps_epi32
(
fx
);
tmp
=
_mm_cvtepi32_ps
(
emm0
);
/* if greater, substract 1 */
Packet4f
mask
=
_mm_cmpgt_ps
(
tmp
,
fx
);
mask
=
_mm_and_ps
(
mask
,
p4f_1
);
fx
=
psub
(
tmp
,
mask
);
tmp
=
pmul
(
fx
,
p4f_cephes_exp_C1
);
Packet4f
z
=
pmul
(
fx
,
p4f_cephes_exp_C2
);
x
=
psub
(
x
,
tmp
);
x
=
psub
(
x
,
z
);
z
=
pmul
(
x
,
x
);
Packet4f
y
=
p4f_cephes_exp_p0
;
y
=
pmadd
(
y
,
x
,
p4f_cephes_exp_p1
);
y
=
pmadd
(
y
,
x
,
p4f_cephes_exp_p2
);
y
=
pmadd
(
y
,
x
,
p4f_cephes_exp_p3
);
y
=
pmadd
(
y
,
x
,
p4f_cephes_exp_p4
);
y
=
pmadd
(
y
,
x
,
p4f_cephes_exp_p5
);
y
=
pmadd
(
y
,
z
,
x
);
y
=
padd
(
y
,
p4f_1
);
// build 2^n
emm0
=
_mm_cvttps_epi32
(
fx
);
emm0
=
_mm_add_epi32
(
emm0
,
p4i_0x7f
);
emm0
=
_mm_slli_epi32
(
emm0
,
23
);
return
pmul
(
y
,
_mm_castsi128_ps
(
emm0
));
}
/* evaluation of 4 sines at onces, using SSE2 intrinsics.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.
*/
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psin<Packet4f>(const Packet4f& _x)
{
  Packet4f x = _x;
  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);

  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
  _EIGEN_DECLARE_CONST_Packet4i(4, 4);

  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);

  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

  Packet4i emm0, emm2;
  sign_bit = x;
  /* take the absolute value */
  x = pabs(x);

  /* take the modulo */

  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);

  /* scale by 4/Pi */
  y = pmul(x, p4f_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, p4i_1);
  emm2 = _mm_and_si128(emm2, p4i_not1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, p4i_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, p4i_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
  Packet4f poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = pmul(y, p4f_minus_cephes_DP1);
  xmm2 = pmul(y, p4f_minus_cephes_DP2);
  xmm3 = pmul(y, p4f_minus_cephes_DP3);
  x = padd(x, xmm1);
  x = padd(x, xmm2);
  x = padd(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = p4f_coscof_p0;
  Packet4f z = _mm_mul_ps(x, x);

  y = pmadd(y, z, p4f_coscof_p1);
  y = pmadd(y, z, p4f_coscof_p2);
  y = pmul(y, z);
  y = pmul(y, z);
  Packet4f tmp = pmul(z, p4f_half);
  y = psub(y, tmp);
  y = padd(y, p4f_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
  Packet4f y2 = p4f_sincof_p0;
  y2 = pmadd(y2, z, p4f_sincof_p1);
  y2 = pmadd(y2, z, p4f_sincof_p2);
  y2 = pmul(y2, z);
  y2 = pmul(y2, x);
  y2 = padd(y2, x);

  /* select the correct result from the two polynoms */
  y2 = _mm_and_ps(poly_mask, y2);
  y  = _mm_andnot_ps(poly_mask, y);
  y  = _mm_or_ps(y, y2);
  /* update the sign */
  return _mm_xor_ps(y, sign_bit);
}
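The kernel above advertises sinf-level accuracy for arguments below 8192. A quick way to spot-check a build is to push a few lanes through psin and compare against std::sin. The harness below is a minimal sketch, assuming an SSE-enabled build with densecrf/include on the include path; psin, ploadu and pstoreu are internal symbols, so this is a debugging aid rather than supported API.

// Spot-check of the vectorized sine against the scalar library function.
#include <Eigen/Core>
#include <cmath>
#include <cstdio>

int main()
{
  using namespace Eigen::internal;
  const float in[4] = {0.1f, 1.0f, 3.14159265f, 100.5f};
  float out[4];
  Packet4f p = ploadu<Packet4f>(in);    // unaligned load of the 4 test values
  pstoreu(out, psin<Packet4f>(p));      // evaluate 4 sines at once, store back
  for (int i = 0; i < 4; ++i)
    std::printf("x=%g  psin=%g  std::sin=%g  err=%g\n",
                in[i], out[i], std::sin(in[i]), out[i] - std::sin(in[i]));
  return 0;
}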
/* almost the same as psin */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pcos<Packet4f>(const Packet4f& _x)
{
  Packet4f x = _x;
  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);

  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
  _EIGEN_DECLARE_CONST_Packet4i(4, 4);

  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
  Packet4i emm0, emm2;

  x = pabs(x);

  /* scale by 4/Pi */
  y = pmul(x, p4f_cephes_FOPI);

  /* get the integer part of y */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, p4i_1);
  emm2 = _mm_and_si128(emm2, p4i_not1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, p4i_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, p4i_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, p4i_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  Packet4f sign_bit  = _mm_castsi128_ps(emm0);
  Packet4f poly_mask = _mm_castsi128_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = pmul(y, p4f_minus_cephes_DP1);
  xmm2 = pmul(y, p4f_minus_cephes_DP2);
  xmm3 = pmul(y, p4f_minus_cephes_DP3);
  x = padd(x, xmm1);
  x = padd(x, xmm2);
  x = padd(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = p4f_coscof_p0;
  Packet4f z = pmul(x, x);

  y = pmadd(y, z, p4f_coscof_p1);
  y = pmadd(y, z, p4f_coscof_p2);
  y = pmul(y, z);
  y = pmul(y, z);
  Packet4f tmp = _mm_mul_ps(z, p4f_half);
  y = psub(y, tmp);
  y = padd(y, p4f_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
  Packet4f y2 = p4f_sincof_p0;
  y2 = pmadd(y2, z, p4f_sincof_p1);
  y2 = pmadd(y2, z, p4f_sincof_p2);
  y2 = pmul(y2, z);
  y2 = pmadd(y2, x, x);

  /* select the correct result from the two polynoms */
  y2 = _mm_and_ps(poly_mask, y2);
  y  = _mm_andnot_ps(poly_mask, y);
  y  = _mm_or_ps(y, y2);

  /* update the sign */
  return _mm_xor_ps(y, sign_bit);
}
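Both kernels depend on the "magic pass": instead of subtracting j*pi/4 in one step, pi/4 is split into the three constants DP1, DP2 and DP3 declared above (stored negated, then added), so the low-order bits of the reduced argument survive. The scalar sketch below illustrates only that subtraction step and omits the j parity fix-ups; the input 100.5f and the double-precision reference are choices made for the illustration.

// Scalar view of the three-constant range reduction used by psin/pcos.
#include <cmath>
#include <cstdio>

int main()
{
  const float  DP1  = 0.78515625f;               // leading bits of pi/4
  const float  DP2  = 2.4187564849853515625e-4f; // next bits
  const float  DP3  = 3.77489497744594108e-8f;   // remaining bits
  const double PI_4 = 0.78539816339744830962;    // reference value of pi/4
  const float  FOPI = 1.27323954473516f;         // 4/pi

  float x = 100.5f;
  float j = std::floor(x * FOPI);                // how many quarter-pi's fit in x
  float naive = x - j * (float)PI_4;             // one subtraction with rounded pi/4
  float split = ((x - j * DP1) - j * DP2) - j * DP3;  // the "magic pass" above
  double ref  = (double)x - (double)j * PI_4;    // higher-precision reference
  std::printf("naive = %.9g  split = %.9g  reference = %.9g\n", naive, split, ref);
  return 0;
}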
// This is based on Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& _x)
{
  Packet4f half = pmul(_x, pset1<Packet4f>(.5f));

  /* select only the inverse sqrt of non-zero inputs */
  Packet4f non_zero_mask = _mm_cmpgt_ps(_x, pset1<Packet4f>(std::numeric_limits<float>::epsilon()));
  Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));

  x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x, x))));
  return pmul(_x, x);
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_MATH_FUNCTIONS_SSE_H
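psqrt computes sqrt(x) as x * rsqrt(x): the roughly 12-bit _mm_rsqrt_ps estimate is refined by one Newton-Raphson step y = y*(1.5 - 0.5*x*y*y) before the final multiply. The standalone scalar sketch below replays that refinement, starting from a deliberately rough guess so the convergence is visible; it is an illustration, not Eigen code.

// Newton-Raphson refinement of a reciprocal square root estimate.
#include <cmath>
#include <cstdio>

int main()
{
  const float x = 2.0f;
  float y = 0.7f;                        // rough guess for 1/sqrt(2) ~ 0.7071
  for (int it = 0; it < 3; ++it) {
    y = y * (1.5f - 0.5f * x * y * y);   // one Newton step on f(y) = 1/y^2 - x
    std::printf("iter %d: sqrt(x) ~ %.8f (true %.8f)\n", it, x * y, std::sqrt(x));
  }
  return 0;
}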
densecrf/include/Eigen/src/Core/arch/SSE/PacketMath.h
0 → 100644
View file @
da2c1226
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H
namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

typedef __m128  Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;

template<> struct is_arithmetic<__m128>  { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
#define vec4f_swizzle1(v,p,q,r,s) \
(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
#define vec4i_swizzle1(v,p,q,r,s) \
(_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
#define vec2d_swizzle1(v,p,q) \
(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
#define vec4f_swizzle2(a,b,p,q,r,s) \
(_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
#define vec4i_swizzle2(a,b,p,q,r,s) \
(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
template
<
>
struct
packet_traits
<
float
>
:
default_packet_traits
{
typedef
Packet4f
type
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
4
,
HasDiv
=
1
,
HasSin
=
EIGEN_FAST_MATH
,
HasCos
=
EIGEN_FAST_MATH
,
HasLog
=
1
,
HasExp
=
1
,
HasSqrt
=
1
};
};
template
<
>
struct
packet_traits
<
double
>
:
default_packet_traits
{
typedef
Packet2d
type
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
2
,
HasDiv
=
1
};
};
template
<
>
struct
packet_traits
<
int
>
:
default_packet_traits
{
typedef
Packet4i
type
;
enum
{
// FIXME check the Has*
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
4
};
};
template
<
>
struct
unpacket_traits
<
Packet4f
>
{
typedef
float
type
;
enum
{
size
=
4
};
};
template
<
>
struct
unpacket_traits
<
Packet2d
>
{
typedef
double
type
;
enum
{
size
=
2
};
};
template
<
>
struct
unpacket_traits
<
Packet4i
>
{
typedef
int
type
;
enum
{
size
=
4
};
};
#if defined(_MSC_VER) && (_MSC_VER==1500)
// Workaround MSVC 9 internal compiler error.
// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pset1
<
Packet4f
>
(
const
float
&
from
)
{
return
_mm_set_ps
(
from
,
from
,
from
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pset1
<
Packet2d
>
(
const
double
&
from
)
{
return
_mm_set_pd
(
from
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pset1
<
Packet4i
>
(
const
int
&
from
)
{
return
_mm_set_epi32
(
from
,
from
,
from
,
from
);
}
#else
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pset1
<
Packet4f
>
(
const
float
&
from
)
{
return
_mm_set1_ps
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pset1
<
Packet2d
>
(
const
double
&
from
)
{
return
_mm_set1_pd
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pset1
<
Packet4i
>
(
const
int
&
from
)
{
return
_mm_set1_epi32
(
from
);
}
#endif
template
<
>
EIGEN_STRONG_INLINE
Packet4f
plset
<
float
>
(
const
float
&
a
)
{
return
_mm_add_ps
(
pset1
<
Packet4f
>
(
a
),
_mm_set_ps
(
3
,
2
,
1
,
0
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
plset
<
double
>
(
const
double
&
a
)
{
return
_mm_add_pd
(
pset1
<
Packet2d
>
(
a
),
_mm_set_pd
(
1
,
0
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
plset
<
int
>
(
const
int
&
a
)
{
return
_mm_add_epi32
(
pset1
<
Packet4i
>
(
a
),
_mm_set_epi32
(
3
,
2
,
1
,
0
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
padd
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_add_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
padd
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_add_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
padd
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
_mm_add_epi32
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
psub
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_sub_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
psub
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_sub_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
psub
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
_mm_sub_epi32
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pnegate
(
const
Packet4f
&
a
)
{
const
Packet4f
mask
=
_mm_castsi128_ps
(
_mm_setr_epi32
(
0x80000000
,
0x80000000
,
0x80000000
,
0x80000000
));
return
_mm_xor_ps
(
a
,
mask
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pnegate
(
const
Packet2d
&
a
)
{
const
Packet2d
mask
=
_mm_castsi128_pd
(
_mm_setr_epi32
(
0x0
,
0x80000000
,
0x0
,
0x80000000
));
return
_mm_xor_pd
(
a
,
mask
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pnegate
(
const
Packet4i
&
a
)
{
return
psub
(
_mm_setr_epi32
(
0
,
0
,
0
,
0
),
a
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmul
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_mul_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmul
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_mul_pd
(
a
,
b
);
}
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a, b);
#else
  // this version is slightly faster than 4 scalar products
  return vec4i_swizzle1(
            vec4i_swizzle2(
              _mm_mul_epu32(a, b),
              _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2),
                            vec4i_swizzle1(b, 1, 0, 3, 2)),
              0, 2, 0, 2),
            0, 2, 1, 3);
#endif
}
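Without SSE4.1's _mm_mullo_epi32, pmul<Packet4i> assembles the four low 32-bit products from two _mm_mul_epu32 calls (which multiply only the even lanes, producing 64-bit results) plus lane swizzles. The scalar reference below shows the identity the shuffle dance relies on, namely that the low 32 bits of a 32x32 multiply are the same whether taken from a 64-bit product or computed with wraparound; it is an illustration, not Eigen code.

// Reference for the SSE2 path of pmul<Packet4i>.
#include <cstdint>
#include <cstdio>

int main()
{
  const uint32_t a[4] = {7u, 0x80000001u, 123456789u, 42u};
  const uint32_t b[4] = {3u, 5u,          987654321u, 0xFFFFFFFFu};
  for (int i = 0; i < 4; ++i) {
    uint64_t wide = (uint64_t)a[i] * (uint64_t)b[i]; // what _mm_mul_epu32 yields per lane pair
    uint32_t low  = (uint32_t)wide;                  // the bits the swizzles gather back into lanes
    std::printf("lane %d: via 64-bit product = %u, direct 32-bit product = %u\n",
                i, (unsigned)low, (unsigned)(a[i] * b[i]));
  }
  return 0;
}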
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pdiv
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_div_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pdiv
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_div_pd
(
a
,
b
);
}
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{
  eigen_assert(false && "packet integer division are not supported by SSE");
  return pset1<Packet4i>(0);
}
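pdiv<Packet4i> only asserts because SSE has no packed 32-bit integer division, so a caller that genuinely needs it must drop to scalar code. One possible fallback is the store/divide/reload pattern sketched below; div_epi32_scalar is a hypothetical helper for illustration, not an Eigen function.

// Hypothetical scalar fallback for 4-lane integer division.
#include <emmintrin.h>
#include <cstdio>

static __m128i div_epi32_scalar(__m128i a, __m128i b)
{
  alignas(16) int av[4], bv[4], rv[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(av), a);
  _mm_store_si128(reinterpret_cast<__m128i*>(bv), b);
  for (int i = 0; i < 4; ++i)
    rv[i] = av[i] / bv[i];               // assumes bv[i] != 0
  return _mm_load_si128(reinterpret_cast<const __m128i*>(rv));
}

int main()
{
  __m128i a = _mm_setr_epi32(10, 21, -9, 100);
  __m128i b = _mm_setr_epi32( 3,  7,  2,   5);
  alignas(16) int r[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(r), div_epi32_scalar(a, b));
  std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);   // 3 3 -4 20
  return 0;
}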
// for some weird reasons, it has to be overloaded for packets of integers
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmadd
(
const
Packet4i
&
a
,
const
Packet4i
&
b
,
const
Packet4i
&
c
)
{
return
padd
(
pmul
(
a
,
b
),
c
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmin
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_min_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmin
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_min_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmin
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
// after some bench, this version *is* faster than a scalar implementation
Packet4i
mask
=
_mm_cmplt_epi32
(
a
,
b
);
return
_mm_or_si128
(
_mm_and_si128
(
mask
,
a
),
_mm_andnot_si128
(
mask
,
b
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmax
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_max_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmax
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_max_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmax
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
// after some bench, this version *is* faster than a scalar implementation
Packet4i
mask
=
_mm_cmpgt_epi32
(
a
,
b
);
return
_mm_or_si128
(
_mm_and_si128
(
mask
,
a
),
_mm_andnot_si128
(
mask
,
b
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pand
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_and_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pand
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_and_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pand
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
_mm_and_si128
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
por
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_or_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
por
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_or_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
por
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
_mm_or_si128
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pxor
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_xor_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pxor
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_xor_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pxor
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
_mm_xor_si128
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pandnot
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
_mm_andnot_ps
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pandnot
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
_mm_andnot_pd
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pandnot
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
_mm_andnot_si128
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pload
<
Packet4f
>
(
const
float
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
_mm_load_ps
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pload
<
Packet2d
>
(
const
double
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
_mm_load_pd
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pload
<
Packet4i
>
(
const
int
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
_mm_load_si128
(
reinterpret_cast
<
const
Packet4i
*>
(
from
));
}
#if defined(_MSC_VER)
template
<
>
EIGEN_STRONG_INLINE
Packet4f
ploadu
<
Packet4f
>
(
const
float
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if (_MSC_VER==1600)
// NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
// (i.e., it does not generate an unaligned load!!
// TODO On most architectures this version should also be faster than a single _mm_loadu_ps
// so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so...
__m128
res
=
_mm_loadl_pi
(
_mm_set1_ps
(
0.0
f
),
(
const
__m64
*
)(
from
));
res
=
_mm_loadh_pi
(
res
,
(
const
__m64
*
)(
from
+
2
));
return
res
;
#else
return
_mm_loadu_ps
(
from
);
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
ploadu
<
Packet2d
>
(
const
double
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
_mm_loadu_pd
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
ploadu
<
Packet4i
>
(
const
int
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
_mm_loadu_si128
(
reinterpret_cast
<
const
Packet4i
*>
(
from
));
}
#else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and leads to invalid code
// because of the strict aliasing rule. The "dummy" stuff are required to enforce
// a correct instruction dependency.
// TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!
#if defined(__GNUC__) && defined(__i386__)
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#elif defined(__clang__)
// bug 201: Segfaults in __mm_loadh_pd with clang 2.8
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#else
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
#endif
template
<
>
EIGEN_STRONG_INLINE
Packet4f
ploadu
<
Packet4f
>
(
const
float
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return
_mm_loadu_ps
(
from
);
#else
__m128d
res
;
res
=
_mm_load_sd
((
const
double
*
)(
from
))
;
res
=
_mm_loadh_pd
(
res
,
(
const
double
*
)(
from
+
2
))
;
return
_mm_castpd_ps
(
res
);
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
ploadu
<
Packet2d
>
(
const
double
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return
_mm_loadu_pd
(
from
);
#else
__m128d
res
;
res
=
_mm_load_sd
(
from
)
;
res
=
_mm_loadh_pd
(
res
,
from
+
1
);
return
res
;
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
ploadu
<
Packet4i
>
(
const
int
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return
_mm_loadu_si128
(
reinterpret_cast
<
const
Packet4i
*>
(
from
));
#else
__m128d
res
;
res
=
_mm_load_sd
((
const
double
*
)(
from
))
;
res
=
_mm_loadh_pd
(
res
,
(
const
double
*
)(
from
+
2
))
;
return
_mm_castpd_si128
(
res
);
#endif
}
#endif
template
<
>
EIGEN_STRONG_INLINE
Packet4f
ploaddup
<
Packet4f
>
(
const
float
*
from
)
{
return
vec4f_swizzle1
(
_mm_castpd_ps
(
_mm_load_sd
(
reinterpret_cast
<
const
double
*>
(
from
))),
0
,
0
,
1
,
1
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
ploaddup
<
Packet2d
>
(
const
double
*
from
)
{
return
pset1
<
Packet2d
>
(
from
[
0
]);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
ploaddup
<
Packet4i
>
(
const
int
*
from
)
{
Packet4i
tmp
;
tmp
=
_mm_loadl_epi64
(
reinterpret_cast
<
const
Packet4i
*>
(
from
));
return
vec4i_swizzle1
(
tmp
,
0
,
0
,
1
,
1
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
_mm_store_ps
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
double
>
(
double
*
to
,
const
Packet2d
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
_mm_store_pd
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
int
>
(
int
*
to
,
const
Packet4i
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
_mm_store_si128
(
reinterpret_cast
<
Packet4i
*>
(
to
),
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
double
>
(
double
*
to
,
const
Packet2d
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
_mm_storel_pd
((
to
),
from
);
_mm_storeh_pd
((
to
+
1
),
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
pstoreu
(
reinterpret_cast
<
double
*>
(
to
),
_mm_castps_pd
(
from
));
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
int
>
(
int
*
to
,
const
Packet4i
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
pstoreu
(
reinterpret_cast
<
double
*>
(
to
),
_mm_castsi128_pd
(
from
));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template
<
>
EIGEN_STRONG_INLINE
void
pstore1
<
Packet4f
>
(
float
*
to
,
const
float
&
a
)
{
Packet4f
pa
=
_mm_set_ss
(
a
);
pstore
(
to
,
vec4f_swizzle1
(
pa
,
0
,
0
,
0
,
0
));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template
<
>
EIGEN_STRONG_INLINE
void
pstore1
<
Packet2d
>
(
double
*
to
,
const
double
&
a
)
{
Packet2d
pa
=
_mm_set_sd
(
a
);
pstore
(
to
,
vec2d_swizzle1
(
pa
,
0
,
0
));
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
float
>
(
const
float
*
addr
)
{
_mm_prefetch
((
const
char
*
)(
addr
),
_MM_HINT_T0
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
double
>
(
const
double
*
addr
)
{
_mm_prefetch
((
const
char
*
)(
addr
),
_MM_HINT_T0
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
int
>
(
const
int
*
addr
)
{
_mm_prefetch
((
const
char
*
)(
addr
),
_MM_HINT_T0
);
}
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct of the struct members fixed bug #62.
template
<
>
EIGEN_STRONG_INLINE
float
pfirst
<
Packet4f
>
(
const
Packet4f
&
a
)
{
return
a
.
m128_f32
[
0
];
}
template
<
>
EIGEN_STRONG_INLINE
double
pfirst
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
a
.
m128d_f64
[
0
];
}
template
<
>
EIGEN_STRONG_INLINE
int
pfirst
<
Packet4i
>
(
const
Packet4i
&
a
)
{
int
x
=
_mm_cvtsi128_si32
(
a
);
return
x
;
}
#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template
<
>
EIGEN_STRONG_INLINE
float
pfirst
<
Packet4f
>
(
const
Packet4f
&
a
)
{
float
x
=
_mm_cvtss_f32
(
a
);
return
x
;
}
template
<
>
EIGEN_STRONG_INLINE
double
pfirst
<
Packet2d
>
(
const
Packet2d
&
a
)
{
double
x
=
_mm_cvtsd_f64
(
a
);
return
x
;
}
template
<
>
EIGEN_STRONG_INLINE
int
pfirst
<
Packet4i
>
(
const
Packet4i
&
a
)
{
int
x
=
_mm_cvtsi128_si32
(
a
);
return
x
;
}
#else
template
<
>
EIGEN_STRONG_INLINE
float
pfirst
<
Packet4f
>
(
const
Packet4f
&
a
)
{
return
_mm_cvtss_f32
(
a
);
}
template
<
>
EIGEN_STRONG_INLINE
double
pfirst
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
_mm_cvtsd_f64
(
a
);
}
template
<
>
EIGEN_STRONG_INLINE
int
pfirst
<
Packet4i
>
(
const
Packet4i
&
a
)
{
return
_mm_cvtsi128_si32
(
a
);
}
#endif
template
<
>
EIGEN_STRONG_INLINE
Packet4f
preverse
(
const
Packet4f
&
a
)
{
return
_mm_shuffle_ps
(
a
,
a
,
0x1B
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
preverse
(
const
Packet2d
&
a
)
{
return
_mm_shuffle_pd
(
a
,
a
,
0x1
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
preverse
(
const
Packet4i
&
a
)
{
return
_mm_shuffle_epi32
(
a
,
0x1B
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pabs
(
const
Packet4f
&
a
)
{
const
Packet4f
mask
=
_mm_castsi128_ps
(
_mm_setr_epi32
(
0x7FFFFFFF
,
0x7FFFFFFF
,
0x7FFFFFFF
,
0x7FFFFFFF
));
return
_mm_and_ps
(
a
,
mask
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pabs
(
const
Packet2d
&
a
)
{
const
Packet2d
mask
=
_mm_castsi128_pd
(
_mm_setr_epi32
(
0xFFFFFFFF
,
0x7FFFFFFF
,
0xFFFFFFFF
,
0x7FFFFFFF
));
return
_mm_and_pd
(
a
,
mask
);
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i aux = _mm_srai_epi32(a, 31);
  return _mm_sub_epi32(_mm_xor_si128(a, aux), aux);
#endif
}
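The SSE2 branch of pabs for Packet4i uses the branchless identity |a| = (a ^ m) - m with m = a >> 31, where the shift is arithmetic so m is 0 for non-negative lanes and all-ones for negative ones. A scalar check of that identity:

// Scalar check of the branchless absolute value used above.
#include <cstdint>
#include <cstdio>

static int32_t abs_branchless(int32_t a)
{
  int32_t m = a >> 31;   // 0 if a >= 0, -1 (all ones) if a < 0 (arithmetic shift assumed)
  return (a ^ m) - m;    // flips the bits and adds 1 only when a is negative
}

int main()
{
  const int32_t v[4] = {5, -5, 0, -123456789};
  for (int i = 0; i < 4; ++i)
    std::printf("%d -> %d\n", v[i], abs_branchless(v[i]));
  return 0;
}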
EIGEN_STRONG_INLINE
void
punpackp
(
Packet4f
*
vecs
)
{
vecs
[
1
]
=
_mm_castsi128_ps
(
_mm_shuffle_epi32
(
_mm_castps_si128
(
vecs
[
0
]),
0x55
));
vecs
[
2
]
=
_mm_castsi128_ps
(
_mm_shuffle_epi32
(
_mm_castps_si128
(
vecs
[
0
]),
0xAA
));
vecs
[
3
]
=
_mm_castsi128_ps
(
_mm_shuffle_epi32
(
_mm_castps_si128
(
vecs
[
0
]),
0xFF
));
vecs
[
0
]
=
_mm_castsi128_ps
(
_mm_shuffle_epi32
(
_mm_castps_si128
(
vecs
[
0
]),
0x00
));
}
#ifdef EIGEN_VECTORIZE_SSE3
// TODO implement SSE2 versions as well as integer versions
template
<
>
EIGEN_STRONG_INLINE
Packet4f
preduxp
<
Packet4f
>
(
const
Packet4f
*
vecs
)
{
return
_mm_hadd_ps
(
_mm_hadd_ps
(
vecs
[
0
],
vecs
[
1
]),
_mm_hadd_ps
(
vecs
[
2
],
vecs
[
3
]));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
preduxp
<
Packet2d
>
(
const
Packet2d
*
vecs
)
{
return
_mm_hadd_pd
(
vecs
[
0
],
vecs
[
1
]);
}
// SSSE3 version:
// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
// {
// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
// }
template
<
>
EIGEN_STRONG_INLINE
float
predux
<
Packet4f
>
(
const
Packet4f
&
a
)
{
Packet4f
tmp0
=
_mm_hadd_ps
(
a
,
a
);
return
pfirst
(
_mm_hadd_ps
(
tmp0
,
tmp0
));
}
template
<
>
EIGEN_STRONG_INLINE
double
predux
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
pfirst
(
_mm_hadd_pd
(
a
,
a
));
}
// SSSE3 version:
// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
// {
// Packet4i tmp0 = _mm_hadd_epi32(a,a);
// return pfirst(_mm_hadd_epi32(tmp0, tmp0));
// }
#else
// SSE2 versions
template
<
>
EIGEN_STRONG_INLINE
float
predux
<
Packet4f
>
(
const
Packet4f
&
a
)
{
Packet4f
tmp
=
_mm_add_ps
(
a
,
_mm_movehl_ps
(
a
,
a
));
return
pfirst
(
_mm_add_ss
(
tmp
,
_mm_shuffle_ps
(
tmp
,
tmp
,
1
)));
}
template
<
>
EIGEN_STRONG_INLINE
double
predux
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
pfirst
(
_mm_add_sd
(
a
,
_mm_unpackhi_pd
(
a
,
a
)));
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
preduxp
<
Packet4f
>
(
const
Packet4f
*
vecs
)
{
Packet4f
tmp0
,
tmp1
,
tmp2
;
tmp0
=
_mm_unpacklo_ps
(
vecs
[
0
],
vecs
[
1
]);
tmp1
=
_mm_unpackhi_ps
(
vecs
[
0
],
vecs
[
1
]);
tmp2
=
_mm_unpackhi_ps
(
vecs
[
2
],
vecs
[
3
]);
tmp0
=
_mm_add_ps
(
tmp0
,
tmp1
);
tmp1
=
_mm_unpacklo_ps
(
vecs
[
2
],
vecs
[
3
]);
tmp1
=
_mm_add_ps
(
tmp1
,
tmp2
);
tmp2
=
_mm_movehl_ps
(
tmp1
,
tmp0
);
tmp0
=
_mm_movelh_ps
(
tmp0
,
tmp1
);
return
_mm_add_ps
(
tmp0
,
tmp2
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
preduxp
<
Packet2d
>
(
const
Packet2d
*
vecs
)
{
return
_mm_add_pd
(
_mm_unpacklo_pd
(
vecs
[
0
],
vecs
[
1
]),
_mm_unpackhi_pd
(
vecs
[
0
],
vecs
[
1
]));
}
#endif // SSE3
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
}
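predux<Packet4i> reduces the packet in two folds: add the upper 64-bit half onto the lower half, then add the two surviving lanes. The standalone snippet below replays the same intrinsic sequence on a known input so the folding is easy to follow:

// Two-step horizontal sum of a 4-lane integer packet.
#include <emmintrin.h>
#include <cstdio>

int main()
{
  __m128i a   = _mm_setr_epi32(1, 2, 3, 4);
  __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));    // {1+3, 2+4, ., .}
  int sum     = _mm_cvtsi128_si32(tmp)                         // lane 0
              + _mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 1));  // lane 1
  std::printf("sum = %d\n", sum);   // 10
  return 0;
}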
template
<
>
EIGEN_STRONG_INLINE
Packet4i
preduxp
<
Packet4i
>
(
const
Packet4i
*
vecs
)
{
Packet4i
tmp0
,
tmp1
,
tmp2
;
tmp0
=
_mm_unpacklo_epi32
(
vecs
[
0
],
vecs
[
1
]);
tmp1
=
_mm_unpackhi_epi32
(
vecs
[
0
],
vecs
[
1
]);
tmp2
=
_mm_unpackhi_epi32
(
vecs
[
2
],
vecs
[
3
]);
tmp0
=
_mm_add_epi32
(
tmp0
,
tmp1
);
tmp1
=
_mm_unpacklo_epi32
(
vecs
[
2
],
vecs
[
3
]);
tmp1
=
_mm_add_epi32
(
tmp1
,
tmp2
);
tmp2
=
_mm_unpacklo_epi64
(
tmp0
,
tmp1
);
tmp0
=
_mm_unpackhi_epi64
(
tmp0
,
tmp1
);
return
_mm_add_epi32
(
tmp0
,
tmp2
);
}
// Other reduction functions:
// mul
template
<
>
EIGEN_STRONG_INLINE
float
predux_mul
<
Packet4f
>
(
const
Packet4f
&
a
)
{
Packet4f
tmp
=
_mm_mul_ps
(
a
,
_mm_movehl_ps
(
a
,
a
));
return
pfirst
(
_mm_mul_ss
(
tmp
,
_mm_shuffle_ps
(
tmp
,
tmp
,
1
)));
}
template
<
>
EIGEN_STRONG_INLINE
double
predux_mul
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
pfirst
(
_mm_mul_sd
(
a
,
_mm_unpackhi_pd
(
a
,
a
)));
}
template
<
>
EIGEN_STRONG_INLINE
int
predux_mul
<
Packet4i
>
(
const
Packet4i
&
a
)
{
// after some experiments, it seems this is the fastest way to implement it
// for GCC (eg., reusing pmul is very slow !)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16
int
aux
[
4
];
pstore
(
aux
,
a
);
return
(
aux
[
0
]
*
aux
[
1
])
*
(
aux
[
2
]
*
aux
[
3
]);;
}
// min
template
<
>
EIGEN_STRONG_INLINE
float
predux_min
<
Packet4f
>
(
const
Packet4f
&
a
)
{
Packet4f
tmp
=
_mm_min_ps
(
a
,
_mm_movehl_ps
(
a
,
a
));
return
pfirst
(
_mm_min_ss
(
tmp
,
_mm_shuffle_ps
(
tmp
,
tmp
,
1
)));
}
template
<
>
EIGEN_STRONG_INLINE
double
predux_min
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
pfirst
(
_mm_min_sd
(
a
,
_mm_unpackhi_pd
(
a
,
a
)));
}
template
<
>
EIGEN_STRONG_INLINE
int
predux_min
<
Packet4i
>
(
const
Packet4i
&
a
)
{
// after some experiments, it is seems this is the fastest way to implement it
// for GCC (eg., it does not like using std::min after the pstore !!)
EIGEN_ALIGN16
int
aux
[
4
];
pstore
(
aux
,
a
);
register
int
aux0
=
aux
[
0
]
<
aux
[
1
]
?
aux
[
0
]
:
aux
[
1
];
register
int
aux2
=
aux
[
2
]
<
aux
[
3
]
?
aux
[
2
]
:
aux
[
3
];
return
aux0
<
aux2
?
aux0
:
aux2
;
}
// max
template
<
>
EIGEN_STRONG_INLINE
float
predux_max
<
Packet4f
>
(
const
Packet4f
&
a
)
{
Packet4f
tmp
=
_mm_max_ps
(
a
,
_mm_movehl_ps
(
a
,
a
));
return
pfirst
(
_mm_max_ss
(
tmp
,
_mm_shuffle_ps
(
tmp
,
tmp
,
1
)));
}
template
<
>
EIGEN_STRONG_INLINE
double
predux_max
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
pfirst
(
_mm_max_sd
(
a
,
_mm_unpackhi_pd
(
a
,
a
)));
}
template
<
>
EIGEN_STRONG_INLINE
int
predux_max
<
Packet4i
>
(
const
Packet4i
&
a
)
{
// after some experiments, it is seems this is the fastest way to implement it
// for GCC (eg., it does not like using std::min after the pstore !!)
EIGEN_ALIGN16
int
aux
[
4
];
pstore
(
aux
,
a
);
register
int
aux0
=
aux
[
0
]
>
aux
[
1
]
?
aux
[
0
]
:
aux
[
1
];
register
int
aux2
=
aux
[
2
]
>
aux
[
3
]
?
aux
[
2
]
:
aux
[
3
];
return
aux0
>
aux2
?
aux0
:
aux2
;
}
#if (defined __GNUC__)
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
// Packet4f res = b;
// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
// return res;
// }
// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
// {
// Packet4i res = a;
// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
// return res;
// }
#endif
#ifdef EIGEN_VECTORIZE_SSSE3
// SSSE3 versions
template
<
int
Offset
>
struct
palign_impl
<
Offset
,
Packet4f
>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet4f
&
first
,
const
Packet4f
&
second
)
{
if
(
Offset
!=
0
)
first
=
_mm_castsi128_ps
(
_mm_alignr_epi8
(
_mm_castps_si128
(
second
),
_mm_castps_si128
(
first
),
Offset
*
4
));
}
};
template
<
int
Offset
>
struct
palign_impl
<
Offset
,
Packet4i
>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet4i
&
first
,
const
Packet4i
&
second
)
{
if
(
Offset
!=
0
)
first
=
_mm_alignr_epi8
(
second
,
first
,
Offset
*
4
);
}
};
template
<
int
Offset
>
struct
palign_impl
<
Offset
,
Packet2d
>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet2d
&
first
,
const
Packet2d
&
second
)
{
if
(
Offset
==
1
)
first
=
_mm_castsi128_pd
(
_mm_alignr_epi8
(
_mm_castpd_si128
(
second
),
_mm_castpd_si128
(
first
),
8
));
}
};
#else
// SSE2 versions
template
<
int
Offset
>
struct
palign_impl
<
Offset
,
Packet4f
>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet4f
&
first
,
const
Packet4f
&
second
)
{
if
(
Offset
==
1
)
{
first
=
_mm_move_ss
(
first
,
second
);
first
=
_mm_castsi128_ps
(
_mm_shuffle_epi32
(
_mm_castps_si128
(
first
),
0x39
));
}
else
if
(
Offset
==
2
)
{
first
=
_mm_movehl_ps
(
first
,
first
);
first
=
_mm_movelh_ps
(
first
,
second
);
}
else
if
(
Offset
==
3
)
{
first
=
_mm_move_ss
(
first
,
second
);
first
=
_mm_shuffle_ps
(
first
,
second
,
0x93
);
}
}
};
template
<
int
Offset
>
struct
palign_impl
<
Offset
,
Packet4i
>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet4i
&
first
,
const
Packet4i
&
second
)
{
if
(
Offset
==
1
)
{
first
=
_mm_castps_si128
(
_mm_move_ss
(
_mm_castsi128_ps
(
first
),
_mm_castsi128_ps
(
second
)));
first
=
_mm_shuffle_epi32
(
first
,
0x39
);
}
else
if
(
Offset
==
2
)
{
first
=
_mm_castps_si128
(
_mm_movehl_ps
(
_mm_castsi128_ps
(
first
),
_mm_castsi128_ps
(
first
)));
first
=
_mm_castps_si128
(
_mm_movelh_ps
(
_mm_castsi128_ps
(
first
),
_mm_castsi128_ps
(
second
)));
}
else
if
(
Offset
==
3
)
{
first
=
_mm_castps_si128
(
_mm_move_ss
(
_mm_castsi128_ps
(
first
),
_mm_castsi128_ps
(
second
)));
first
=
_mm_castps_si128
(
_mm_shuffle_ps
(
_mm_castsi128_ps
(
first
),
_mm_castsi128_ps
(
second
),
0x93
));
}
}
};
template
<
int
Offset
>
struct
palign_impl
<
Offset
,
Packet2d
>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet2d
&
first
,
const
Packet2d
&
second
)
{
if
(
Offset
==
1
)
{
first
=
_mm_castps_pd
(
_mm_movehl_ps
(
_mm_castpd_ps
(
first
),
_mm_castpd_ps
(
first
)));
first
=
_mm_castps_pd
(
_mm_movelh_ps
(
_mm_castpd_ps
(
first
),
_mm_castpd_ps
(
second
)));
}
}
};
#endif
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_PACKET_MATH_SSE_H
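Taken together, these wrappers give the rest of Eigen a uniform packet vocabulary (pset1, pload/ploadu, padd, pmul, pstore, predux, ...) over raw SSE registers, so the higher layers can be written once per packet type. The sketch below shows the kind of loop those layers end up generating, written directly against the intrinsics the wrappers forward to so it compiles without Eigen's internal headers; the multiple-of-4 length and 16-byte alignment are assumptions of the sketch.

// What a vectorized y += a*x loop looks like in terms of the underlying SSE ops.
#include <xmmintrin.h>
#include <cstdio>

void axpy4(float* y, const float* x, float a, int n)
{
  __m128 pa = _mm_set1_ps(a);                    // pset1<Packet4f>(a)
  for (int i = 0; i < n; i += 4) {
    __m128 px = _mm_load_ps(x + i);              // pload<Packet4f>
    __m128 py = _mm_load_ps(y + i);
    py = _mm_add_ps(py, _mm_mul_ps(pa, px));     // padd(py, pmul(pa, px))
    _mm_store_ps(y + i, py);                     // pstore
  }
}

int main()
{
  alignas(16) float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  alignas(16) float y[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  axpy4(y, x, 0.5f, 8);
  for (int i = 0; i < 8; ++i) std::printf("%g ", y[i]);
  std::printf("\n");
  return 0;
}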
densecrf/include/Eigen/src/Core/products/CMakeLists.txt
0 → 100644
View file @
da2c1226
FILE(GLOB Eigen_Core_Product_SRCS "*.h")

INSTALL(FILES
  ${Eigen_Core_Product_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel
  )
densecrf/include/Eigen/src/Core/products/CoeffBasedProduct.h
0 → 100644
View file @
da2c1226
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COEFFBASED_PRODUCT_H
#define EIGEN_COEFFBASED_PRODUCT_H
namespace
Eigen
{
namespace
internal
{
/*********************************************************************************
* Coefficient based product implementation.
* It is designed for the following use cases:
* - small fixed sizes
* - lazy products
*********************************************************************************/
/* Since the all the dimensions of the product are small, here we can rely
* on the generic Assign mechanism to evaluate the product per coeff (or packet).
*
* Note that here the inner-loops should always be unrolled.
*/
template
<
int
Traversal
,
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
RetScalar
>
struct
product_coeff_impl
;
template
<
int
StorageOrder
,
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
;
template
<
typename
LhsNested
,
typename
RhsNested
,
int
NestingFlags
>
struct
traits
<
CoeffBasedProduct
<
LhsNested
,
RhsNested
,
NestingFlags
>
>
{
typedef
MatrixXpr
XprKind
;
typedef
typename
remove_all
<
LhsNested
>::
type
_LhsNested
;
typedef
typename
remove_all
<
RhsNested
>::
type
_RhsNested
;
typedef
typename
scalar_product_traits
<
typename
_LhsNested
::
Scalar
,
typename
_RhsNested
::
Scalar
>::
ReturnType
Scalar
;
typedef
typename
promote_storage_type
<
typename
traits
<
_LhsNested
>::
StorageKind
,
typename
traits
<
_RhsNested
>::
StorageKind
>::
ret
StorageKind
;
typedef
typename
promote_index_type
<
typename
traits
<
_LhsNested
>::
Index
,
typename
traits
<
_RhsNested
>::
Index
>::
type
Index
;
enum
{
LhsCoeffReadCost
=
_LhsNested
::
CoeffReadCost
,
RhsCoeffReadCost
=
_RhsNested
::
CoeffReadCost
,
LhsFlags
=
_LhsNested
::
Flags
,
RhsFlags
=
_RhsNested
::
Flags
,
RowsAtCompileTime
=
_LhsNested
::
RowsAtCompileTime
,
ColsAtCompileTime
=
_RhsNested
::
ColsAtCompileTime
,
InnerSize
=
EIGEN_SIZE_MIN_PREFER_FIXED
(
_LhsNested
::
ColsAtCompileTime
,
_RhsNested
::
RowsAtCompileTime
),
MaxRowsAtCompileTime
=
_LhsNested
::
MaxRowsAtCompileTime
,
MaxColsAtCompileTime
=
_RhsNested
::
MaxColsAtCompileTime
,
LhsRowMajor
=
LhsFlags
&
RowMajorBit
,
RhsRowMajor
=
RhsFlags
&
RowMajorBit
,
SameType
=
is_same
<
typename
_LhsNested
::
Scalar
,
typename
_RhsNested
::
Scalar
>::
value
,
CanVectorizeRhs
=
RhsRowMajor
&&
(
RhsFlags
&
PacketAccessBit
)
&&
(
ColsAtCompileTime
==
Dynamic
||
(
(
ColsAtCompileTime
%
packet_traits
<
Scalar
>::
size
)
==
0
&&
(
RhsFlags
&
AlignedBit
)
)
),
CanVectorizeLhs
=
(
!
LhsRowMajor
)
&&
(
LhsFlags
&
PacketAccessBit
)
&&
(
RowsAtCompileTime
==
Dynamic
||
(
(
RowsAtCompileTime
%
packet_traits
<
Scalar
>::
size
)
==
0
&&
(
LhsFlags
&
AlignedBit
)
)
),
EvalToRowMajor
=
(
MaxRowsAtCompileTime
==
1
&&
MaxColsAtCompileTime
!=
1
)
?
1
:
(
MaxColsAtCompileTime
==
1
&&
MaxRowsAtCompileTime
!=
1
)
?
0
:
(
RhsRowMajor
&&
!
CanVectorizeLhs
),
Flags
=
((
unsigned
int
)(
LhsFlags
|
RhsFlags
)
&
HereditaryBits
&
~
RowMajorBit
)
|
(
EvalToRowMajor
?
RowMajorBit
:
0
)
|
NestingFlags
|
(
LhsFlags
&
RhsFlags
&
AlignedBit
)
// TODO enable vectorization for mixed types
|
(
SameType
&&
(
CanVectorizeLhs
||
CanVectorizeRhs
)
?
PacketAccessBit
:
0
),
CoeffReadCost
=
InnerSize
==
Dynamic
?
Dynamic
:
InnerSize
*
(
NumTraits
<
Scalar
>::
MulCost
+
LhsCoeffReadCost
+
RhsCoeffReadCost
)
+
(
InnerSize
-
1
)
*
NumTraits
<
Scalar
>::
AddCost
,
/* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
* of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
* loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
* the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
*/
CanVectorizeInner
=
SameType
&&
LhsRowMajor
&&
(
!
RhsRowMajor
)
&&
(
LhsFlags
&
RhsFlags
&
ActualPacketAccessBit
)
&&
(
LhsFlags
&
RhsFlags
&
AlignedBit
)
&&
(
InnerSize
%
packet_traits
<
Scalar
>::
size
==
0
)
};
};
}
// end namespace internal
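The CoeffReadCost entry in the traits above is what later drives the Unroll decision (Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT). The snippet below works the formula for a fixed-size 3x3 float product; the unit cost values are assumptions made for the illustration, not guaranteed NumTraits values.

// Worked instance of the CoeffReadCost formula quoted in the traits above.
#include <cstdio>

int main()
{
  const int InnerSize        = 3;  // common dimension of the 3x3 * 3x3 product
  const int MulCost          = 1;  // NumTraits<float>::MulCost (assumed)
  const int AddCost          = 1;  // NumTraits<float>::AddCost (assumed)
  const int LhsCoeffReadCost = 1;  // plain dense matrix operand (assumed)
  const int RhsCoeffReadCost = 1;

  const int CoeffReadCost =
      InnerSize * (MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
      + (InnerSize - 1) * AddCost;

  std::printf("CoeffReadCost = %d (unrolled iff <= EIGEN_UNROLLING_LIMIT)\n",
              CoeffReadCost);   // prints 11
  return 0;
}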
template
<
typename
LhsNested
,
typename
RhsNested
,
int
NestingFlags
>
class
CoeffBasedProduct
:
internal
::
no_assignment_operator
,
public
MatrixBase
<
CoeffBasedProduct
<
LhsNested
,
RhsNested
,
NestingFlags
>
>
{
public:
typedef
MatrixBase
<
CoeffBasedProduct
>
Base
;
EIGEN_DENSE_PUBLIC_INTERFACE
(
CoeffBasedProduct
)
typedef
typename
Base
::
PlainObject
PlainObject
;
private:
typedef
typename
internal
::
traits
<
CoeffBasedProduct
>::
_LhsNested
_LhsNested
;
typedef
typename
internal
::
traits
<
CoeffBasedProduct
>::
_RhsNested
_RhsNested
;
enum
{
PacketSize
=
internal
::
packet_traits
<
Scalar
>::
size
,
InnerSize
=
internal
::
traits
<
CoeffBasedProduct
>::
InnerSize
,
Unroll
=
CoeffReadCost
!=
Dynamic
&&
CoeffReadCost
<=
EIGEN_UNROLLING_LIMIT
,
CanVectorizeInner
=
internal
::
traits
<
CoeffBasedProduct
>::
CanVectorizeInner
};
typedef
internal
::
product_coeff_impl
<
CanVectorizeInner
?
InnerVectorizedTraversal
:
DefaultTraversal
,
Unroll
?
InnerSize
-
1
:
Dynamic
,
_LhsNested
,
_RhsNested
,
Scalar
>
ScalarCoeffImpl
;
typedef
CoeffBasedProduct
<
LhsNested
,
RhsNested
,
NestByRefBit
>
LazyCoeffBasedProductType
;
public:
inline
CoeffBasedProduct
(
const
CoeffBasedProduct
&
other
)
:
Base
(),
m_lhs
(
other
.
m_lhs
),
m_rhs
(
other
.
m_rhs
)
{}
template
<
typename
Lhs
,
typename
Rhs
>
inline
CoeffBasedProduct
(
const
Lhs
&
lhs
,
const
Rhs
&
rhs
)
:
m_lhs
(
lhs
),
m_rhs
(
rhs
)
{
// we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
// We still allow to mix T and complex<T>.
EIGEN_STATIC_ASSERT
((
internal
::
is_same
<
typename
Lhs
::
RealScalar
,
typename
Rhs
::
RealScalar
>::
value
),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY
)
eigen_assert
(
lhs
.
cols
()
==
rhs
.
rows
()
&&
"invalid matrix product"
&&
"if you wanted a coeff-wise or a dot product use the respective explicit functions"
);
}
EIGEN_STRONG_INLINE
Index
rows
()
const
{
return
m_lhs
.
rows
();
}
EIGEN_STRONG_INLINE
Index
cols
()
const
{
return
m_rhs
.
cols
();
}
EIGEN_STRONG_INLINE
const
Scalar
coeff
(
Index
row
,
Index
col
)
const
{
Scalar
res
;
ScalarCoeffImpl
::
run
(
row
,
col
,
m_lhs
,
m_rhs
,
res
);
return
res
;
}
/* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
* which is why we don't set the LinearAccessBit.
*/
EIGEN_STRONG_INLINE
const
Scalar
coeff
(
Index
index
)
const
{
Scalar
res
;
const
Index
row
=
RowsAtCompileTime
==
1
?
0
:
index
;
const
Index
col
=
RowsAtCompileTime
==
1
?
index
:
0
;
ScalarCoeffImpl
::
run
(
row
,
col
,
m_lhs
,
m_rhs
,
res
);
return
res
;
}
template
<
int
LoadMode
>
EIGEN_STRONG_INLINE
const
PacketScalar
packet
(
Index
row
,
Index
col
)
const
{
PacketScalar
res
;
internal
::
product_packet_impl
<
Flags
&
RowMajorBit
?
RowMajor
:
ColMajor
,
Unroll
?
InnerSize
-
1
:
Dynamic
,
_LhsNested
,
_RhsNested
,
PacketScalar
,
LoadMode
>
::
run
(
row
,
col
,
m_lhs
,
m_rhs
,
res
);
return
res
;
}
// Implicit conversion to the nested type (trigger the evaluation of the product)
EIGEN_STRONG_INLINE
operator
const
PlainObject
&
()
const
{
m_result
.
lazyAssign
(
*
this
);
return
m_result
;
}
const
_LhsNested
&
lhs
()
const
{
return
m_lhs
;
}
const
_RhsNested
&
rhs
()
const
{
return
m_rhs
;
}
const
Diagonal
<
const
LazyCoeffBasedProductType
,
0
>
diagonal
()
const
{
return
reinterpret_cast
<
const
LazyCoeffBasedProductType
&>
(
*
this
);
}
template
<
int
DiagonalIndex
>
const
Diagonal
<
const
LazyCoeffBasedProductType
,
DiagonalIndex
>
diagonal
()
const
{
return
reinterpret_cast
<
const
LazyCoeffBasedProductType
&>
(
*
this
);
}
const
Diagonal
<
const
LazyCoeffBasedProductType
,
Dynamic
>
diagonal
(
Index
index
)
const
{
return
reinterpret_cast
<
const
LazyCoeffBasedProductType
&>
(
*
this
).
diagonal
(
index
);
}
protected:
typename
internal
::
add_const_on_value_type
<
LhsNested
>::
type
m_lhs
;
typename
internal
::
add_const_on_value_type
<
RhsNested
>::
type
m_rhs
;
mutable
PlainObject
m_result
;
};
namespace
internal
{
// here we need to overload the nested rule for products
// such that the nested type is a const reference to a plain matrix
template
<
typename
Lhs
,
typename
Rhs
,
int
N
,
typename
PlainObject
>
struct
nested
<
CoeffBasedProduct
<
Lhs
,
Rhs
,
EvalBeforeNestingBit
|
EvalBeforeAssigningBit
>
,
N
,
PlainObject
>
{
typedef
PlainObject
const
&
type
;
};
/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/
/**************************************
*** Scalar path - no vectorization ***
**************************************/
template
<
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
RetScalar
>
struct
product_coeff_impl
<
DefaultTraversal
,
UnrollingIndex
,
Lhs
,
Rhs
,
RetScalar
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
RetScalar
&
res
)
{
product_coeff_impl
<
DefaultTraversal
,
UnrollingIndex
-
1
,
Lhs
,
Rhs
,
RetScalar
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
res
+=
lhs
.
coeff
(
row
,
UnrollingIndex
)
*
rhs
.
coeff
(
UnrollingIndex
,
col
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
RetScalar
>
struct
product_coeff_impl
<
DefaultTraversal
,
0
,
Lhs
,
Rhs
,
RetScalar
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
RetScalar
&
res
)
{
res
=
lhs
.
coeff
(
row
,
0
)
*
rhs
.
coeff
(
0
,
col
);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
  typedef typename Lhs::Index Index;
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
  {
    eigen_assert(lhs.cols() > 0 && "you are using a non initialized matrix");
    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
    for(Index i = 1; i < lhs.cols(); ++i)
      res += lhs.coeff(row, i) * rhs.coeff(i, col);
  }
};
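The Dynamic specialization above spells out what a coefficient-based product is: each requested coefficient is one dot product of an lhs row with an rhs column, evaluated only when asked for. A small usage sketch through the public API, assuming <Eigen/Dense> from this source tree; lazyProduct requests the lazy, coefficient-based form of the product.

// A lazy product entry is just a row-by-column dot product.
#include <Eigen/Dense>
#include <cstdio>

int main()
{
  Eigen::Matrix3f A, B;
  A.setRandom();
  B.setRandom();

  Eigen::Matrix3f C = A.lazyProduct(B);     // evaluated coefficient by coefficient
  float byHand = A.row(1).dot(B.col(2));    // the same dot product, written out

  std::printf("C(1,2) = %f  row.dot(col) = %f\n", C(1, 2), byHand);
  return 0;
}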
/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/
template
<
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
Packet
>
struct
product_coeff_vectorized_unroller
{
typedef
typename
Lhs
::
Index
Index
;
enum
{
PacketSize
=
packet_traits
<
typename
Lhs
::
Scalar
>::
size
};
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
PacketScalar
&
pres
)
{
product_coeff_vectorized_unroller
<
UnrollingIndex
-
PacketSize
,
Lhs
,
Rhs
,
Packet
>::
run
(
row
,
col
,
lhs
,
rhs
,
pres
);
pres
=
padd
(
pres
,
pmul
(
lhs
.
template
packet
<
Aligned
>(
row
,
UnrollingIndex
)
,
rhs
.
template
packet
<
Aligned
>(
UnrollingIndex
,
col
)
));
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
>
struct
product_coeff_vectorized_unroller
<
0
,
Lhs
,
Rhs
,
Packet
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
PacketScalar
&
pres
)
{
pres
=
pmul
(
lhs
.
template
packet
<
Aligned
>(
row
,
0
)
,
rhs
.
template
packet
<
Aligned
>(
0
,
col
));
}
};
template
<
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
RetScalar
>
struct
product_coeff_impl
<
InnerVectorizedTraversal
,
UnrollingIndex
,
Lhs
,
Rhs
,
RetScalar
>
{
typedef
typename
Lhs
::
PacketScalar
Packet
;
typedef
typename
Lhs
::
Index
Index
;
enum
{
PacketSize
=
packet_traits
<
typename
Lhs
::
Scalar
>::
size
};
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
RetScalar
&
res
)
{
Packet
pres
;
product_coeff_vectorized_unroller
<
UnrollingIndex
+
1
-
PacketSize
,
Lhs
,
Rhs
,
Packet
>::
run
(
row
,
col
,
lhs
,
rhs
,
pres
);
product_coeff_impl
<
DefaultTraversal
,
UnrollingIndex
,
Lhs
,
Rhs
,
RetScalar
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
res
=
predux
(
pres
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
int
LhsRows
=
Lhs
::
RowsAtCompileTime
,
int
RhsCols
=
Rhs
::
ColsAtCompileTime
>
struct
product_coeff_vectorized_dyn_selector
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
res
=
lhs
.
row
(
row
).
transpose
().
cwiseProduct
(
rhs
.
col
(
col
)).
sum
();
}
};
// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template
<
typename
Lhs
,
typename
Rhs
,
int
RhsCols
>
struct
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
,
1
,
RhsCols
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
/*row*/
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
res
=
lhs
.
transpose
().
cwiseProduct
(
rhs
.
col
(
col
)).
sum
();
}
};
template
<
typename
Lhs
,
typename
Rhs
,
int
LhsRows
>
struct
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
,
LhsRows
,
1
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
/*col*/
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
res
=
lhs
.
row
(
row
).
transpose
().
cwiseProduct
(
rhs
).
sum
();
}
};
template
<
typename
Lhs
,
typename
Rhs
>
struct
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
,
1
,
1
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
/*row*/
,
Index
/*col*/
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
res
=
lhs
.
transpose
().
cwiseProduct
(
rhs
).
sum
();
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
RetScalar
>
struct
product_coeff_impl
<
InnerVectorizedTraversal
,
Dynamic
,
Lhs
,
Rhs
,
RetScalar
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
typename
Lhs
::
Scalar
&
res
)
{
product_coeff_vectorized_dyn_selector
<
Lhs
,
Rhs
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
}
};
/*******************
*** Packet path ***
*******************/
template
<
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
RowMajor
,
UnrollingIndex
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
product_packet_impl
<
RowMajor
,
UnrollingIndex
-
1
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
res
=
pmadd
(
pset1
<
Packet
>
(
lhs
.
coeff
(
row
,
UnrollingIndex
)),
rhs
.
template
packet
<
LoadMode
>(
UnrollingIndex
,
col
),
res
);
}
};
template
<
int
UnrollingIndex
,
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
ColMajor
,
UnrollingIndex
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
product_packet_impl
<
ColMajor
,
UnrollingIndex
-
1
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>::
run
(
row
,
col
,
lhs
,
rhs
,
res
);
res
=
pmadd
(
lhs
.
template
packet
<
LoadMode
>(
row
,
UnrollingIndex
),
pset1
<
Packet
>
(
rhs
.
coeff
(
UnrollingIndex
,
col
)),
res
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
RowMajor
,
0
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
res
=
pmul
(
pset1
<
Packet
>
(
lhs
.
coeff
(
row
,
0
)),
rhs
.
template
packet
<
LoadMode
>(
0
,
col
));
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
ColMajor
,
0
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
res
=
pmul
(
lhs
.
template
packet
<
LoadMode
>(
row
,
0
),
pset1
<
Packet
>
(
rhs
.
coeff
(
0
,
col
)));
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
RowMajor
,
Dynamic
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
eigen_assert
(
lhs
.
cols
()
>
0
&&
"you are using a non initialized matrix"
);
res
=
pmul
(
pset1
<
Packet
>
(
lhs
.
coeff
(
row
,
0
)),
rhs
.
template
packet
<
LoadMode
>(
0
,
col
));
for
(
Index
i
=
1
;
i
<
lhs
.
cols
();
++
i
)
res
=
pmadd
(
pset1
<
Packet
>
(
lhs
.
coeff
(
row
,
i
)),
rhs
.
template
packet
<
LoadMode
>(
i
,
col
),
res
);
}
};
template
<
typename
Lhs
,
typename
Rhs
,
typename
Packet
,
int
LoadMode
>
struct
product_packet_impl
<
ColMajor
,
Dynamic
,
Lhs
,
Rhs
,
Packet
,
LoadMode
>
{
typedef
typename
Lhs
::
Index
Index
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
row
,
Index
col
,
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Packet
&
res
)
{
eigen_assert
(
lhs
.
cols
()
>
0
&&
"you are using a non initialized matrix"
);
res
=
pmul
(
lhs
.
template
packet
<
LoadMode
>(
row
,
0
),
pset1
<
Packet
>
(
rhs
.
coeff
(
0
,
col
)));
for
(
Index
i
=
1
;
i
<
lhs
.
cols
();
++
i
)
res
=
pmadd
(
lhs
.
template
packet
<
LoadMode
>(
row
,
i
),
pset1
<
Packet
>
(
rhs
.
coeff
(
i
,
col
)),
res
);
}
};
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_COEFFBASED_PRODUCT_H
densecrf/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h
0 → 100644
View file @
da2c1226
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H
namespace
Eigen
{
namespace
internal
{
template
<
typename
_LhsScalar
,
typename
_RhsScalar
,
bool
_ConjLhs
=
false
,
bool
_ConjRhs
=
false
>
class
gebp_traits
;
/** \internal \returns b if a<=0, and returns a otherwise. */
inline
std
::
ptrdiff_t
manage_caching_sizes_helper
(
std
::
ptrdiff_t
a
,
std
::
ptrdiff_t
b
)
{
return
a
<=
0
?
b
:
a
;
}
/** \internal */
inline
void
manage_caching_sizes
(
Action
action
,
std
::
ptrdiff_t
*
l1
=
0
,
std
::
ptrdiff_t
*
l2
=
0
)
{
static
std
::
ptrdiff_t
m_l1CacheSize
=
0
;
static
std
::
ptrdiff_t
m_l2CacheSize
=
0
;
if
(
m_l2CacheSize
==
0
)
{
m_l1CacheSize
=
manage_caching_sizes_helper
(
queryL1CacheSize
(),
8
*
1024
);
m_l2CacheSize
=
manage_caching_sizes_helper
(
queryTopLevelCacheSize
(),
1
*
1024
*
1024
);
}
if
(
action
==
SetAction
)
{
// set the cpu cache size and cache all block sizes from a global cache size in byte
eigen_internal_assert
(
l1
!=
0
&&
l2
!=
0
);
m_l1CacheSize
=
*
l1
;
m_l2CacheSize
=
*
l2
;
}
else
if
(
action
==
GetAction
)
{
eigen_internal_assert
(
l1
!=
0
&&
l2
!=
0
);
*
l1
=
m_l1CacheSize
;
*
l2
=
m_l2CacheSize
;
}
else
{
eigen_internal_assert
(
false
);
}
}
/** \brief Computes the blocking parameters for a m x k times k x n matrix product
  *
  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
  * this function computes the blocking size parameters along the respective dimensions
  * for matrix products and related algorithms. The blocking sizes depends on various
  * parameters:
  * - the L1 and L2 cache sizes,
  * - the register level blocking sizes defined by gebp_traits,
  * - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
  EIGEN_UNUSED_VARIABLE(n);
  // Explanations:
  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
  // stay in L1 cache.
  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  k = std::min<SizeType>(k, l1/kdiv);
  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask;
}
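The blocking logic above is short to trace: kc is capped by the L1 budget divided by kdiv, and mc by how many kc-long lhs panels fit in a quarter of L2, rounded down to a multiple of mr. The snippet below works one instance with assumed cache sizes of 32 KB L1 and 256 KB L2 and the single-precision SSE gebp parameters nr = 4, mr = 8, RhsProgress = 4; those numbers are assumptions of the sketch, and the mr rounding is written out as a plain division instead of the mr_mask trick.

// Worked instance of computeProductBlockingSizes for float on a 64-bit SSE build.
#include <algorithm>
#include <cstdio>

int main()
{
  const long l1 = 32 * 1024, l2 = 256 * 1024;  // assumed L1/L2 sizes in bytes
  const long nr = 4, mr = 8, RhsProgress = 4;  // gebp_traits<float,float> on SSE (assumed)
  const long KcFactor = 1, sizeofScalar = 4;   // sizeof(float)

  long k = 1000, m = 1000;                     // requested product dimensions

  const long kdiv = KcFactor * 2 * nr * RhsProgress * sizeofScalar;  // = 128
  k = std::min(k, l1 / kdiv);                              // kc: 32768 / 128 = 256
  long _m = k > 0 ? l2 / (4 * sizeofScalar * k) : 0;       // 262144 / 4096 = 64
  if (_m < m) m = (_m / mr) * mr;                          // round down to a multiple of mr

  std::printf("kc = %ld, mc = %ld\n", k, m);               // kc = 256, mc = 64
  return 0;
}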
template
<
typename
LhsScalar
,
typename
RhsScalar
,
typename
SizeType
>
inline
void
computeProductBlockingSizes
(
SizeType
&
k
,
SizeType
&
m
,
SizeType
&
n
)
{
computeProductBlockingSizes
<
LhsScalar
,
RhsScalar
,
1
>
(
k
,
m
,
n
);
}
#ifdef EIGEN_HAS_FUSE_CJMADD
#define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#else
// FIXME (a bit overkill maybe ?)
template
<
typename
CJ
,
typename
A
,
typename
B
,
typename
C
,
typename
T
>
struct
gebp_madd_selector
{
EIGEN_ALWAYS_INLINE
static
void
run
(
const
CJ
&
cj
,
A
&
a
,
B
&
b
,
C
&
c
,
T
&
/*t*/
)
{
c
=
cj
.
pmadd
(
a
,
b
,
c
);
}
};
template
<
typename
CJ
,
typename
T
>
struct
gebp_madd_selector
<
CJ
,
T
,
T
,
T
,
T
>
{
EIGEN_ALWAYS_INLINE
static
void
run
(
const
CJ
&
cj
,
T
&
a
,
T
&
b
,
T
&
c
,
T
&
t
)
{
t
=
b
;
t
=
cj
.
pmul
(
a
,
t
);
c
=
padd
(
c
,
t
);
}
};
template
<
typename
CJ
,
typename
A
,
typename
B
,
typename
C
,
typename
T
>
EIGEN_STRONG_INLINE
void
gebp_madd
(
const
CJ
&
cj
,
A
&
a
,
B
&
b
,
C
&
c
,
T
&
t
)
{
gebp_madd_selector
<
CJ
,
A
,
B
,
C
,
T
>::
run
(
cj
,
a
,
b
,
c
,
t
);
}
#define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif
/* Vectorization logic
* real*real: unpack rhs to constant packets, ...
*
* cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
* storing each res packet into two packets (2x2),
* at the end combine them: swap the second and addsub them
* cf*cf : same but with 2x4 blocks
* cplx*real : unpack rhs to constant packets, ...
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
*/
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction (must be either 2 or 4)
    nr = NumberOfRegisters/4,

    // register block size along the M direction (currently, this one cannot be modified)
    mr = 2 * LhsPacketSize,

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
};
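These traits describe an mr x nr register block: mr rows of the packed lhs and nr columns of the packed rhs are multiplied into mr*nr accumulators that stay in registers for the whole depth loop, and are written back to the result only once. A plain scalar sketch of such a micro-kernel, illustrative only and assuming fixed mr=4, nr=2 instead of the packet-based traits above:

// Illustrative 4x2 register-blocked micro-kernel (scalar stand-in for the
// packet-based gebp kernel): C[0..3][0..1] += alpha * A(:,k) * B(k,:) over depth.
void microKernel4x2(const float* Apacked,   // mr x depth, stored column by column
                    const float* Bpacked,   // depth x nr, stored row by row
                    float* C, int ldc,      // column-major result, leading dim ldc
                    int depth, float alpha)
{
  float acc[4][2] = {{0.f}};                // accumulators live in registers
  for (int k = 0; k < depth; ++k)
  {
    const float b0 = Bpacked[2*k + 0];
    const float b1 = Bpacked[2*k + 1];
    for (int i = 0; i < 4; ++i)
    {
      const float a = Apacked[4*k + i];
      acc[i][0] += a * b0;
      acc[i][1] += a * b1;
    }
  }
  for (int j = 0; j < 2; ++j)
    for (int i = 0; i < 4; ++i)
      C[i + j*ldc] += alpha * acc[i][j];    // single update of the result block
}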
template
<
typename
RealScalar
,
bool
_ConjLhs
>
class
gebp_traits
<
std
::
complex
<
RealScalar
>
,
RealScalar
,
_ConjLhs
,
false
>
{
public:
typedef
std
::
complex
<
RealScalar
>
LhsScalar
;
typedef
RealScalar
RhsScalar
;
typedef
typename
scalar_product_traits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
enum
{
ConjLhs
=
_ConjLhs
,
ConjRhs
=
false
,
Vectorizable
=
packet_traits
<
LhsScalar
>::
Vectorizable
&&
packet_traits
<
RhsScalar
>::
Vectorizable
,
LhsPacketSize
=
Vectorizable
?
packet_traits
<
LhsScalar
>::
size
:
1
,
RhsPacketSize
=
Vectorizable
?
packet_traits
<
RhsScalar
>::
size
:
1
,
ResPacketSize
=
Vectorizable
?
packet_traits
<
ResScalar
>::
size
:
1
,
NumberOfRegisters
=
EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
,
nr
=
NumberOfRegisters
/
4
,
mr
=
2
*
LhsPacketSize
,
WorkSpaceFactor
=
nr
*
RhsPacketSize
,
LhsProgress
=
LhsPacketSize
,
RhsProgress
=
RhsPacketSize
};
typedef
typename
packet_traits
<
LhsScalar
>::
type
_LhsPacket
;
typedef
typename
packet_traits
<
RhsScalar
>::
type
_RhsPacket
;
typedef
typename
packet_traits
<
ResScalar
>::
type
_ResPacket
;
typedef
typename
conditional
<
Vectorizable
,
_LhsPacket
,
LhsScalar
>::
type
LhsPacket
;
typedef
typename
conditional
<
Vectorizable
,
_RhsPacket
,
RhsScalar
>::
type
RhsPacket
;
typedef
typename
conditional
<
Vectorizable
,
_ResPacket
,
ResScalar
>::
type
ResPacket
;
typedef
ResPacket
AccPacket
;
EIGEN_STRONG_INLINE
void
initAcc
(
AccPacket
&
p
)
{
p
=
pset1
<
ResPacket
>
(
ResScalar
(
0
));
}
EIGEN_STRONG_INLINE
void
unpackRhs
(
DenseIndex
n
,
const
RhsScalar
*
rhs
,
RhsScalar
*
b
)
{
for
(
DenseIndex
k
=
0
;
k
<
n
;
k
++
)
pstore1
<
RhsPacket
>
(
&
b
[
k
*
RhsPacketSize
],
rhs
[
k
]);
}
EIGEN_STRONG_INLINE
void
loadRhs
(
const
RhsScalar
*
b
,
RhsPacket
&
dest
)
const
{
dest
=
pload
<
RhsPacket
>
(
b
);
}
EIGEN_STRONG_INLINE
void
loadLhs
(
const
LhsScalar
*
a
,
LhsPacket
&
dest
)
const
{
dest
=
pload
<
LhsPacket
>
(
a
);
}
EIGEN_STRONG_INLINE
void
madd
(
const
LhsPacket
&
a
,
const
RhsPacket
&
b
,
AccPacket
&
c
,
RhsPacket
&
tmp
)
const
{
madd_impl
(
a
,
b
,
c
,
tmp
,
typename
conditional
<
Vectorizable
,
true_type
,
false_type
>::
type
());
}
EIGEN_STRONG_INLINE
void
madd_impl
(
const
LhsPacket
&
a
,
const
RhsPacket
&
b
,
AccPacket
&
c
,
RhsPacket
&
tmp
,
const
true_type
&
)
const
{
tmp
=
b
;
tmp
=
pmul
(
a
.
v
,
tmp
);
c
.
v
=
padd
(
c
.
v
,
tmp
);
}
EIGEN_STRONG_INLINE
void
madd_impl
(
const
LhsScalar
&
a
,
const
RhsScalar
&
b
,
ResScalar
&
c
,
RhsScalar
&
/*tmp*/
,
const
false_type
&
)
const
{
c
+=
a
*
b
;
}
EIGEN_STRONG_INLINE
void
acc
(
const
AccPacket
&
c
,
const
ResPacket
&
alpha
,
ResPacket
&
r
)
const
{
r
=
cj
.
pmadd
(
c
,
alpha
,
r
);
}
protected:
conj_helper
<
ResPacket
,
ResPacket
,
ConjLhs
,
false
>
cj
;
};
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef std::complex<RealScalar>  LhsScalar;
  typedef std::complex<RealScalar>  RhsScalar;
  typedef std::complex<RealScalar>  ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first   = pset1<RealPacket>(RealScalar(0));
    p.second  = pset1<RealPacket>(RealScalar(0));
  }

  /* Unpack the rhs coeff such that each complex coefficient is spread into
   * two packects containing respectively the real and imaginary coefficient
   * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
   */
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  // nothing special here
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
  {
    c.first   = padd(pmul(a,b.first),  c.first);
    c.second  = padd(pmul(a,b.second), c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble c
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
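For the complex*complex case the rhs is unpacked into separate broadcast real and imaginary packets, so each accumulator holds first = a*b_r and second = a*b_i with a kept in interleaved (a_r, a_i) form; acc() then conjugates and flips "second" before adding it to "first" to recover the ordinary complex product. A scalar sketch of that recombination, for the no-conjugation branch only:

// Scalar sketch of how acc() reassembles a complex product from the two
// real accumulators used above (no conjugation case).
#include <complex>
#include <cstdio>

std::complex<float> assemble(float ar, float ai, float br, float bi)
{
  // What the kernel accumulates, packet-wise:
  const float first_re  = ar * br, first_im  = ai * br;   // a * b.first  (b_r broadcast)
  const float second_re = ar * bi, second_im = ai * bi;   // a * b.second (b_i broadcast)

  // acc(): conjugate 'second', swap its real/imag parts, then add 'first'.
  const float flipped_re = -second_im;   // pcplxflip(pconj(second)), real part
  const float flipped_im =  second_re;   // pcplxflip(pconj(second)), imag part
  return std::complex<float>(first_re + flipped_re, first_im + flipped_im);
}

int main()
{
  std::complex<float> ref = std::complex<float>(1.f, 2.f) * std::complex<float>(3.f, 4.f);
  std::complex<float> got = assemble(1.f, 2.f, 3.f, 4.f);
  std::printf("ref=(%g,%g) got=(%g,%g)\n", ref.real(), ref.imag(), got.real(), got.imag());
  return 0;   // both print (-5,10)
}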
template
<
typename
RealScalar
,
bool
_ConjRhs
>
class
gebp_traits
<
RealScalar
,
std
::
complex
<
RealScalar
>
,
false
,
_ConjRhs
>
{
public:
typedef
std
::
complex
<
RealScalar
>
Scalar
;
typedef
RealScalar
LhsScalar
;
typedef
Scalar
RhsScalar
;
typedef
Scalar
ResScalar
;
enum
{
ConjLhs
=
false
,
ConjRhs
=
_ConjRhs
,
Vectorizable
=
packet_traits
<
RealScalar
>::
Vectorizable
&&
packet_traits
<
Scalar
>::
Vectorizable
,
LhsPacketSize
=
Vectorizable
?
packet_traits
<
LhsScalar
>::
size
:
1
,
RhsPacketSize
=
Vectorizable
?
packet_traits
<
RhsScalar
>::
size
:
1
,
ResPacketSize
=
Vectorizable
?
packet_traits
<
ResScalar
>::
size
:
1
,
NumberOfRegisters
=
EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
,
nr
=
4
,
mr
=
2
*
ResPacketSize
,
WorkSpaceFactor
=
nr
*
RhsPacketSize
,
LhsProgress
=
ResPacketSize
,
RhsProgress
=
ResPacketSize
};
typedef
typename
packet_traits
<
LhsScalar
>::
type
_LhsPacket
;
typedef
typename
packet_traits
<
RhsScalar
>::
type
_RhsPacket
;
typedef
typename
packet_traits
<
ResScalar
>::
type
_ResPacket
;
typedef
typename
conditional
<
Vectorizable
,
_LhsPacket
,
LhsScalar
>::
type
LhsPacket
;
typedef
typename
conditional
<
Vectorizable
,
_RhsPacket
,
RhsScalar
>::
type
RhsPacket
;
typedef
typename
conditional
<
Vectorizable
,
_ResPacket
,
ResScalar
>::
type
ResPacket
;
typedef
ResPacket
AccPacket
;
EIGEN_STRONG_INLINE
void
initAcc
(
AccPacket
&
p
)
{
p
=
pset1
<
ResPacket
>
(
ResScalar
(
0
));
}
EIGEN_STRONG_INLINE
void
unpackRhs
(
DenseIndex
n
,
const
RhsScalar
*
rhs
,
RhsScalar
*
b
)
{
for
(
DenseIndex
k
=
0
;
k
<
n
;
k
++
)
pstore1
<
RhsPacket
>
(
&
b
[
k
*
RhsPacketSize
],
rhs
[
k
]);
}
EIGEN_STRONG_INLINE
void
loadRhs
(
const
RhsScalar
*
b
,
RhsPacket
&
dest
)
const
{
dest
=
pload
<
RhsPacket
>
(
b
);
}
EIGEN_STRONG_INLINE
void
loadLhs
(
const
LhsScalar
*
a
,
LhsPacket
&
dest
)
const
{
dest
=
ploaddup
<
LhsPacket
>
(
a
);
}
EIGEN_STRONG_INLINE
void
madd
(
const
LhsPacket
&
a
,
const
RhsPacket
&
b
,
AccPacket
&
c
,
RhsPacket
&
tmp
)
const
{
madd_impl
(
a
,
b
,
c
,
tmp
,
typename
conditional
<
Vectorizable
,
true_type
,
false_type
>::
type
());
}
EIGEN_STRONG_INLINE
void
madd_impl
(
const
LhsPacket
&
a
,
const
RhsPacket
&
b
,
AccPacket
&
c
,
RhsPacket
&
tmp
,
const
true_type
&
)
const
{
tmp
=
b
;
tmp
.
v
=
pmul
(
a
,
tmp
.
v
);
c
=
padd
(
c
,
tmp
);
}
EIGEN_STRONG_INLINE
void
madd_impl
(
const
LhsScalar
&
a
,
const
RhsScalar
&
b
,
ResScalar
&
c
,
RhsScalar
&
/*tmp*/
,
const
false_type
&
)
const
{
c
+=
a
*
b
;
}
EIGEN_STRONG_INLINE
void
acc
(
const
AccPacket
&
c
,
const
ResPacket
&
alpha
,
ResPacket
&
r
)
const
{
r
=
cj
.
pmadd
(
alpha
,
c
,
r
);
}
protected:
conj_helper
<
ResPacket
,
ResPacket
,
false
,
ConjRhs
>
cj
;
};
/* optimized GEneral packed Block * packed Panel product kernel
*
* Mixing type logic: C += A * B
* | A | B | comments
* |real |cplx | no vectorization yet, would require to pack A with duplication
* |cplx |real | easy vectorization
*/
template
<
typename
LhsScalar
,
typename
RhsScalar
,
typename
Index
,
int
mr
,
int
nr
,
bool
ConjugateLhs
,
bool
ConjugateRhs
>
struct
gebp_kernel
{
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
,
ConjugateLhs
,
ConjugateRhs
>
Traits
;
typedef
typename
Traits
::
ResScalar
ResScalar
;
typedef
typename
Traits
::
LhsPacket
LhsPacket
;
typedef
typename
Traits
::
RhsPacket
RhsPacket
;
typedef
typename
Traits
::
ResPacket
ResPacket
;
typedef
typename
Traits
::
AccPacket
AccPacket
;
enum
{
Vectorizable
=
Traits
::
Vectorizable
,
LhsProgress
=
Traits
::
LhsProgress
,
RhsProgress
=
Traits
::
RhsProgress
,
ResPacketSize
=
Traits
::
ResPacketSize
};
EIGEN_DONT_INLINE
EIGEN_FLATTEN_ATTRIB
void
operator
()(
ResScalar
*
res
,
Index
resStride
,
const
LhsScalar
*
blockA
,
const
RhsScalar
*
blockB
,
Index
rows
,
Index
depth
,
Index
cols
,
ResScalar
alpha
,
Index
strideA
=-
1
,
Index
strideB
=-
1
,
Index
offsetA
=
0
,
Index
offsetB
=
0
,
RhsScalar
*
unpackedB
=
0
)
{
Traits
traits
;
if
(
strideA
==-
1
)
strideA
=
depth
;
if
(
strideB
==-
1
)
strideB
=
depth
;
conj_helper
<
LhsScalar
,
RhsScalar
,
ConjugateLhs
,
ConjugateRhs
>
cj
;
// conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
Index
packet_cols
=
(
cols
/
nr
)
*
nr
;
const
Index
peeled_mc
=
(
rows
/
mr
)
*
mr
;
// FIXME:
const
Index
peeled_mc2
=
peeled_mc
+
(
rows
-
peeled_mc
>=
LhsProgress
?
LhsProgress
:
0
);
const
Index
peeled_kc
=
(
depth
/
4
)
*
4
;
if
(
unpackedB
==
0
)
unpackedB
=
const_cast
<
RhsScalar
*>
(
blockB
-
strideB
*
nr
*
RhsProgress
);
// loops on each micro vertical panel of rhs (depth x nr)
for
(
Index
j2
=
0
;
j2
<
packet_cols
;
j2
+=
nr
)
{
traits
.
unpackRhs
(
depth
*
nr
,
&
blockB
[
j2
*
strideB
+
offsetB
*
nr
],
unpackedB
);
// loops on each largest micro horizontal panel of lhs (mr x depth)
// => we select a mr x nr micro block of res which is entirely
// stored into mr/packet_size x nr registers.
for
(
Index
i
=
0
;
i
<
peeled_mc
;
i
+=
mr
)
{
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
mr
];
prefetch
(
&
blA
[
0
]);
// gets res block as register
AccPacket
C0
,
C1
,
C2
,
C3
,
C4
,
C5
,
C6
,
C7
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C1
);
if
(
nr
==
4
)
traits
.
initAcc
(
C2
);
if
(
nr
==
4
)
traits
.
initAcc
(
C3
);
traits
.
initAcc
(
C4
);
traits
.
initAcc
(
C5
);
if
(
nr
==
4
)
traits
.
initAcc
(
C6
);
if
(
nr
==
4
)
traits
.
initAcc
(
C7
);
ResScalar
*
r0
=
&
res
[(
j2
+
0
)
*
resStride
+
i
];
ResScalar
*
r1
=
r0
+
resStride
;
ResScalar
*
r2
=
r1
+
resStride
;
ResScalar
*
r3
=
r2
+
resStride
;
prefetch
(
r0
+
16
);
prefetch
(
r1
+
16
);
prefetch
(
r2
+
16
);
prefetch
(
r3
+
16
);
// performs "inner" product
// TODO let's check wether the folowing peeled loop could not be
// optimized via optimal prefetching from one loop to the other
const
RhsScalar
*
blB
=
unpackedB
;
for
(
Index
k
=
0
;
k
<
peeled_kc
;
k
+=
4
)
{
if
(
nr
==
2
)
{
LhsPacket
A0
,
A1
;
RhsPacket
B_0
;
RhsPacket
T0
;
EIGEN_ASM_COMMENT
(
"mybegin2"
);
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C1
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C5
,
B_0
);
traits
.
loadLhs
(
&
blA
[
2
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
3
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C1
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C5
,
B_0
);
traits
.
loadLhs
(
&
blA
[
4
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
5
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
4
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
5
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C1
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C5
,
B_0
);
traits
.
loadLhs
(
&
blA
[
6
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
7
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
6
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
7
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C1
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C5
,
B_0
);
EIGEN_ASM_COMMENT
(
"myend"
);
}
else
{
EIGEN_ASM_COMMENT
(
"mybegin4"
);
LhsPacket
A0
,
A1
;
RhsPacket
B_0
,
B1
,
B2
,
B3
;
RhsPacket
T0
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B2
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B3
);
traits
.
loadRhs
(
&
blB
[
4
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
T0
);
traits
.
madd
(
A1
,
B1
,
C5
,
B1
);
traits
.
loadRhs
(
&
blB
[
5
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
T0
);
traits
.
madd
(
A1
,
B2
,
C6
,
B2
);
traits
.
loadRhs
(
&
blB
[
6
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
T0
);
traits
.
loadLhs
(
&
blA
[
2
*
LhsProgress
],
A0
);
traits
.
madd
(
A1
,
B3
,
C7
,
B3
);
traits
.
loadLhs
(
&
blA
[
3
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
7
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
8
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
T0
);
traits
.
madd
(
A1
,
B1
,
C5
,
B1
);
traits
.
loadRhs
(
&
blB
[
9
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
T0
);
traits
.
madd
(
A1
,
B2
,
C6
,
B2
);
traits
.
loadRhs
(
&
blB
[
10
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
T0
);
traits
.
loadLhs
(
&
blA
[
4
*
LhsProgress
],
A0
);
traits
.
madd
(
A1
,
B3
,
C7
,
B3
);
traits
.
loadLhs
(
&
blA
[
5
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
11
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
12
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
T0
);
traits
.
madd
(
A1
,
B1
,
C5
,
B1
);
traits
.
loadRhs
(
&
blB
[
13
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
T0
);
traits
.
madd
(
A1
,
B2
,
C6
,
B2
);
traits
.
loadRhs
(
&
blB
[
14
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
T0
);
traits
.
loadLhs
(
&
blA
[
6
*
LhsProgress
],
A0
);
traits
.
madd
(
A1
,
B3
,
C7
,
B3
);
traits
.
loadLhs
(
&
blA
[
7
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
15
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
T0
);
traits
.
madd
(
A1
,
B1
,
C5
,
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
T0
);
traits
.
madd
(
A1
,
B2
,
C6
,
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
T0
);
traits
.
madd
(
A1
,
B3
,
C7
,
B3
);
}
blB
+=
4
*
nr
*
RhsProgress
;
blA
+=
4
*
mr
;
}
// process remaining peeled loop
for
(
Index
k
=
peeled_kc
;
k
<
depth
;
k
++
)
{
if
(
nr
==
2
)
{
LhsPacket
A0
,
A1
;
RhsPacket
B_0
;
RhsPacket
T0
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C1
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C5
,
B_0
);
}
else
{
LhsPacket
A0
,
A1
;
RhsPacket
B_0
,
B1
,
B2
,
B3
;
RhsPacket
T0
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B2
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B1
,
C1
,
T0
);
traits
.
madd
(
A1
,
B1
,
C5
,
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
T0
);
traits
.
madd
(
A1
,
B2
,
C6
,
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
T0
);
traits
.
madd
(
A1
,
B3
,
C7
,
B3
);
}
blB
+=
nr
*
RhsProgress
;
blA
+=
mr
;
}
if
(
nr
==
4
)
{
ResPacket
R0
,
R1
,
R2
,
R3
,
R4
,
R5
,
R6
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
R0
=
ploadu
<
ResPacket
>
(
r0
);
R1
=
ploadu
<
ResPacket
>
(
r1
);
R2
=
ploadu
<
ResPacket
>
(
r2
);
R3
=
ploadu
<
ResPacket
>
(
r3
);
R4
=
ploadu
<
ResPacket
>
(
r0
+
ResPacketSize
);
R5
=
ploadu
<
ResPacket
>
(
r1
+
ResPacketSize
);
R6
=
ploadu
<
ResPacket
>
(
r2
+
ResPacketSize
);
traits
.
acc
(
C0
,
alphav
,
R0
);
pstoreu
(
r0
,
R0
);
R0
=
ploadu
<
ResPacket
>
(
r3
+
ResPacketSize
);
traits
.
acc
(
C1
,
alphav
,
R1
);
traits
.
acc
(
C2
,
alphav
,
R2
);
traits
.
acc
(
C3
,
alphav
,
R3
);
traits
.
acc
(
C4
,
alphav
,
R4
);
traits
.
acc
(
C5
,
alphav
,
R5
);
traits
.
acc
(
C6
,
alphav
,
R6
);
traits
.
acc
(
C7
,
alphav
,
R0
);
pstoreu
(
r1
,
R1
);
pstoreu
(
r2
,
R2
);
pstoreu
(
r3
,
R3
);
pstoreu
(
r0
+
ResPacketSize
,
R4
);
pstoreu
(
r1
+
ResPacketSize
,
R5
);
pstoreu
(
r2
+
ResPacketSize
,
R6
);
pstoreu
(
r3
+
ResPacketSize
,
R0
);
}
else
{
ResPacket
R0
,
R1
,
R4
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
R0
=
ploadu
<
ResPacket
>
(
r0
);
R1
=
ploadu
<
ResPacket
>
(
r1
);
R4
=
ploadu
<
ResPacket
>
(
r0
+
ResPacketSize
);
traits
.
acc
(
C0
,
alphav
,
R0
);
pstoreu
(
r0
,
R0
);
R0
=
ploadu
<
ResPacket
>
(
r1
+
ResPacketSize
);
traits
.
acc
(
C1
,
alphav
,
R1
);
traits
.
acc
(
C4
,
alphav
,
R4
);
traits
.
acc
(
C5
,
alphav
,
R0
);
pstoreu
(
r1
,
R1
);
pstoreu
(
r0
+
ResPacketSize
,
R4
);
pstoreu
(
r1
+
ResPacketSize
,
R0
);
}
}
if
(
rows
-
peeled_mc
>=
LhsProgress
)
{
Index
i
=
peeled_mc
;
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
LhsProgress
];
prefetch
(
&
blA
[
0
]);
// gets res block as register
AccPacket
C0
,
C1
,
C2
,
C3
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C1
);
if
(
nr
==
4
)
traits
.
initAcc
(
C2
);
if
(
nr
==
4
)
traits
.
initAcc
(
C3
);
// performs "inner" product
const
RhsScalar
*
blB
=
unpackedB
;
for
(
Index
k
=
0
;
k
<
peeled_kc
;
k
+=
4
)
{
if
(
nr
==
2
)
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
4
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadLhs
(
&
blA
[
2
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
5
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
6
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadLhs
(
&
blA
[
3
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
7
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
}
else
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
,
B2
,
B3
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B2
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B3
);
traits
.
loadRhs
(
&
blB
[
4
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadRhs
(
&
blB
[
5
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
loadRhs
(
&
blB
[
6
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
7
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
8
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadRhs
(
&
blB
[
9
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
loadRhs
(
&
blB
[
10
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
traits
.
loadLhs
(
&
blA
[
2
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
11
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
loadRhs
(
&
blB
[
12
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
loadRhs
(
&
blB
[
13
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
loadRhs
(
&
blB
[
14
*
RhsProgress
],
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
traits
.
loadLhs
(
&
blA
[
3
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
15
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
}
blB
+=
nr
*
4
*
RhsProgress
;
blA
+=
4
*
LhsProgress
;
}
// process remaining peeled loop
for
(
Index
k
=
peeled_kc
;
k
<
depth
;
k
++
)
{
if
(
nr
==
2
)
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
}
else
{
LhsPacket
A0
;
RhsPacket
B_0
,
B1
,
B2
,
B3
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
loadRhs
(
&
blB
[
1
*
RhsProgress
],
B1
);
traits
.
loadRhs
(
&
blB
[
2
*
RhsProgress
],
B2
);
traits
.
loadRhs
(
&
blB
[
3
*
RhsProgress
],
B3
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
traits
.
madd
(
A0
,
B1
,
C1
,
B1
);
traits
.
madd
(
A0
,
B2
,
C2
,
B2
);
traits
.
madd
(
A0
,
B3
,
C3
,
B3
);
}
blB
+=
nr
*
RhsProgress
;
blA
+=
LhsProgress
;
}
ResPacket
R0
,
R1
,
R2
,
R3
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
ResScalar
*
r0
=
&
res
[(
j2
+
0
)
*
resStride
+
i
];
ResScalar
*
r1
=
r0
+
resStride
;
ResScalar
*
r2
=
r1
+
resStride
;
ResScalar
*
r3
=
r2
+
resStride
;
R0
=
ploadu
<
ResPacket
>
(
r0
);
R1
=
ploadu
<
ResPacket
>
(
r1
);
if
(
nr
==
4
)
R2
=
ploadu
<
ResPacket
>
(
r2
);
if
(
nr
==
4
)
R3
=
ploadu
<
ResPacket
>
(
r3
);
traits
.
acc
(
C0
,
alphav
,
R0
);
traits
.
acc
(
C1
,
alphav
,
R1
);
if
(
nr
==
4
)
traits
.
acc
(
C2
,
alphav
,
R2
);
if
(
nr
==
4
)
traits
.
acc
(
C3
,
alphav
,
R3
);
pstoreu
(
r0
,
R0
);
pstoreu
(
r1
,
R1
);
if
(
nr
==
4
)
pstoreu
(
r2
,
R2
);
if
(
nr
==
4
)
pstoreu
(
r3
,
R3
);
}
for
(
Index
i
=
peeled_mc2
;
i
<
rows
;
i
++
)
{
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
];
prefetch
(
&
blA
[
0
]);
// gets a 1 x nr res block as registers
ResScalar
C0
(
0
),
C1
(
0
),
C2
(
0
),
C3
(
0
);
// TODO directly use blockB ???
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
*
nr
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
if
(
nr
==
2
)
{
LhsScalar
A0
;
RhsScalar
B_0
,
B1
;
A0
=
blA
[
k
];
B_0
=
blB
[
0
];
B1
=
blB
[
1
];
MADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
MADD
(
cj
,
A0
,
B1
,
C1
,
B1
);
}
else
{
LhsScalar
A0
;
RhsScalar
B_0
,
B1
,
B2
,
B3
;
A0
=
blA
[
k
];
B_0
=
blB
[
0
];
B1
=
blB
[
1
];
B2
=
blB
[
2
];
B3
=
blB
[
3
];
MADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
MADD
(
cj
,
A0
,
B1
,
C1
,
B1
);
MADD
(
cj
,
A0
,
B2
,
C2
,
B2
);
MADD
(
cj
,
A0
,
B3
,
C3
,
B3
);
}
blB
+=
nr
;
}
res
[(
j2
+
0
)
*
resStride
+
i
]
+=
alpha
*
C0
;
res
[(
j2
+
1
)
*
resStride
+
i
]
+=
alpha
*
C1
;
if
(
nr
==
4
)
res
[(
j2
+
2
)
*
resStride
+
i
]
+=
alpha
*
C2
;
if
(
nr
==
4
)
res
[(
j2
+
3
)
*
resStride
+
i
]
+=
alpha
*
C3
;
}
}
// process remaining rhs/res columns one at a time
// => do the same but with nr==1
for
(
Index
j2
=
packet_cols
;
j2
<
cols
;
j2
++
)
{
// unpack B
traits
.
unpackRhs
(
depth
,
&
blockB
[
j2
*
strideB
+
offsetB
],
unpackedB
);
for
(
Index
i
=
0
;
i
<
peeled_mc
;
i
+=
mr
)
{
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
mr
];
prefetch
(
&
blA
[
0
]);
// TODO move the res loads to the stores
// get res block as registers
AccPacket
C0
,
C4
;
traits
.
initAcc
(
C0
);
traits
.
initAcc
(
C4
);
const
RhsScalar
*
blB
=
unpackedB
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
LhsPacket
A0
,
A1
;
RhsPacket
B_0
;
RhsPacket
T0
;
traits
.
loadLhs
(
&
blA
[
0
*
LhsProgress
],
A0
);
traits
.
loadLhs
(
&
blA
[
1
*
LhsProgress
],
A1
);
traits
.
loadRhs
(
&
blB
[
0
*
RhsProgress
],
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
T0
);
traits
.
madd
(
A1
,
B_0
,
C4
,
B_0
);
blB
+=
RhsProgress
;
blA
+=
2
*
LhsProgress
;
}
ResPacket
R0
,
R4
;
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
ResScalar
*
r0
=
&
res
[(
j2
+
0
)
*
resStride
+
i
];
R0
=
ploadu
<
ResPacket
>
(
r0
);
R4
=
ploadu
<
ResPacket
>
(
r0
+
ResPacketSize
);
traits
.
acc
(
C0
,
alphav
,
R0
);
traits
.
acc
(
C4
,
alphav
,
R4
);
pstoreu
(
r0
,
R0
);
pstoreu
(
r0
+
ResPacketSize
,
R4
);
}
if
(
rows
-
peeled_mc
>=
LhsProgress
)
{
Index
i
=
peeled_mc
;
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
*
LhsProgress
];
prefetch
(
&
blA
[
0
]);
AccPacket
C0
;
traits
.
initAcc
(
C0
);
const
RhsScalar
*
blB
=
unpackedB
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
LhsPacket
A0
;
RhsPacket
B_0
;
traits
.
loadLhs
(
blA
,
A0
);
traits
.
loadRhs
(
blB
,
B_0
);
traits
.
madd
(
A0
,
B_0
,
C0
,
B_0
);
blB
+=
RhsProgress
;
blA
+=
LhsProgress
;
}
ResPacket
alphav
=
pset1
<
ResPacket
>
(
alpha
);
ResPacket
R0
=
ploadu
<
ResPacket
>
(
&
res
[(
j2
+
0
)
*
resStride
+
i
]);
traits
.
acc
(
C0
,
alphav
,
R0
);
pstoreu
(
&
res
[(
j2
+
0
)
*
resStride
+
i
],
R0
);
}
for
(
Index
i
=
peeled_mc2
;
i
<
rows
;
i
++
)
{
const
LhsScalar
*
blA
=
&
blockA
[
i
*
strideA
+
offsetA
];
prefetch
(
&
blA
[
0
]);
// gets a 1 x 1 res block as registers
ResScalar
C0
(
0
);
// FIXME directly use blockB ??
const
RhsScalar
*
blB
=
&
blockB
[
j2
*
strideB
+
offsetB
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
LhsScalar
A0
=
blA
[
k
];
RhsScalar
B_0
=
blB
[
k
];
MADD
(
cj
,
A0
,
B_0
,
C0
,
B_0
);
}
res
[(
j2
+
0
)
*
resStride
+
i
]
+=
alpha
*
C0
;
}
}
}
};
#undef CJMADD
// pack a block of the lhs
// The traversal is as follow (mr==4):
// 0 4 8 12 ...
// 1 5 9 13 ...
// 2 6 10 14 ...
// 3 7 11 15 ...
//
// 16 20 24 28 ...
// 17 21 25 29 ...
// 18 22 26 30 ...
// 19 23 27 31 ...
//
// 32 33 34 35 ...
// 36 36 38 39 ...
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
                                    Index stride=0, Index offset=0)
  {
    typedef typename packet_traits<Scalar>::type Packet;
    enum { PacketSize = packet_traits<Scalar>::size };

    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
    Index peeled_mc = (rows/Pack1)*Pack1;
    for(Index i=0; i<peeled_mc; i+=Pack1)
    {
      if(PanelMode) count += Pack1 * offset;

      if(StorageOrder==ColMajor)
      {
        for(Index k=0; k<depth; k++)
        {
          Packet A, B, C, D;
          if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
          if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
          if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
          if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
          if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
          if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
          if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
          if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
        }
      }
      else
      {
        for(Index k=0; k<depth; k++)
        {
          // TODO add a vectorized transpose here
          Index w=0;
          for(; w<Pack1-3; w+=4)
          {
            Scalar a(cj(lhs(i+w+0, k))),
                   b(cj(lhs(i+w+1, k))),
                   c(cj(lhs(i+w+2, k))),
                   d(cj(lhs(i+w+3, k)));
            blockA[count++] = a;
            blockA[count++] = b;
            blockA[count++] = c;
            blockA[count++] = d;
          }
          if(Pack1%4)
            for(; w<Pack1; ++w)
              blockA[count++] = cj(lhs(i+w, k));
        }
      }
      if(PanelMode) count += Pack1 * (stride-offset-depth);
    }
    if(rows-peeled_mc>=Pack2)
    {
      if(PanelMode) count += Pack2*offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(peeled_mc+w, k));
      if(PanelMode) count += Pack2 * (stride-offset-depth);
      peeled_mc += Pack2;
    }
    for(Index i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
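gemm_pack_lhs copies an mr-row slab of the lhs into contiguous memory so the kernel can stream it linearly: within a slab, the mr entries of each column are stored together, column after column, exactly in the traversal order sketched in the comment above. A simplified scalar packing routine for a column-major lhs, ignoring PanelMode, conjugation and the Pack2/tail handling of the real code:

// Simplified lhs packing (column-major input, no conjugation, rows % mr == 0).
void packLhs(float* blockA, const float* lhs, int lhsStride,
             int depth, int rows, int mr)
{
  int count = 0;
  for (int i = 0; i < rows; i += mr)          // one mr-row slab at a time
    for (int k = 0; k < depth; ++k)           // walk the columns of the slab
      for (int w = 0; w < mr; ++w)            // mr consecutive rows, stored contiguously
        blockA[count++] = lhs[(i + w) + k * lhsStride];
}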
// copy a complete panel of the rhs
// this version is optimized for column major matrices
// The traversal order is as follow: (nr==4):
// 0 1 2 3 12 13 14 15 24 27
// 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . .
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                                    Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
                  blockB[count+0] = cj(b0[k]);
                  blockB[count+1] = cj(b1[k]);
        if(nr==4) blockB[count+2] = cj(b2[k]);
        if(nr==4) blockB[count+3] = cj(b3[k]);
        count += nr;
      }
      // skip what we have after
      if(PanelMode) count += nr * (stride-offset-depth);
    }

    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k]);
        count += 1;
      }
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
// this version is optimized for row major matrices
template
<
typename
Scalar
,
typename
Index
,
int
nr
,
bool
Conjugate
,
bool
PanelMode
>
struct
gemm_pack_rhs
<
Scalar
,
Index
,
nr
,
RowMajor
,
Conjugate
,
PanelMode
>
{
enum
{
PacketSize
=
packet_traits
<
Scalar
>::
size
};
EIGEN_DONT_INLINE
void
operator
()(
Scalar
*
blockB
,
const
Scalar
*
rhs
,
Index
rhsStride
,
Index
depth
,
Index
cols
,
Index
stride
=
0
,
Index
offset
=
0
)
{
EIGEN_ASM_COMMENT
(
"EIGEN PRODUCT PACK RHS ROWMAJOR"
);
eigen_assert
(((
!
PanelMode
)
&&
stride
==
0
&&
offset
==
0
)
||
(
PanelMode
&&
stride
>=
depth
&&
offset
<=
stride
));
conj_if
<
NumTraits
<
Scalar
>::
IsComplex
&&
Conjugate
>
cj
;
Index
packet_cols
=
(
cols
/
nr
)
*
nr
;
Index
count
=
0
;
for
(
Index
j2
=
0
;
j2
<
packet_cols
;
j2
+=
nr
)
{
// skip what we have before
if
(
PanelMode
)
count
+=
nr
*
offset
;
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
const
Scalar
*
b0
=
&
rhs
[
k
*
rhsStride
+
j2
];
blockB
[
count
+
0
]
=
cj
(
b0
[
0
]);
blockB
[
count
+
1
]
=
cj
(
b0
[
1
]);
if
(
nr
==
4
)
blockB
[
count
+
2
]
=
cj
(
b0
[
2
]);
if
(
nr
==
4
)
blockB
[
count
+
3
]
=
cj
(
b0
[
3
]);
count
+=
nr
;
}
// skip what we have after
if
(
PanelMode
)
count
+=
nr
*
(
stride
-
offset
-
depth
);
}
// copy the remaining columns one at a time (nr==1)
for
(
Index
j2
=
packet_cols
;
j2
<
cols
;
++
j2
)
{
if
(
PanelMode
)
count
+=
offset
;
const
Scalar
*
b0
=
&
rhs
[
j2
];
for
(
Index
k
=
0
;
k
<
depth
;
k
++
)
{
blockB
[
count
]
=
cj
(
b0
[
k
*
rhsStride
]);
count
+=
1
;
}
if
(
PanelMode
)
count
+=
stride
-
offset
-
depth
;
}
}
};
} // end namespace internal

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSize */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSize */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the algorithms working per blocks.
  *
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
}

} // end namespace Eigen

#endif // EIGEN_GENERAL_BLOCK_PANEL_H
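These three functions are part of Eigen's public API, so the blocking heuristics can be tuned from user code. A minimal usage example; the byte values are just placeholders for a hypothetical CPU:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Query the cache sizes currently used by the blocking heuristics...
  std::cout << "L1: " << Eigen::l1CacheSize()
            << "  L2: " << Eigen::l2CacheSize() << std::endl;

  // ...and override them, e.g. for a machine with 32KB L1 and 1MB L2.
  Eigen::setCpuCacheSizes(32 * 1024, 1024 * 1024);

  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd C = A * B;   // GEMM now blocks according to the new sizes
  std::cout << C(0, 0) << std::endl;
  return 0;
}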
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrix.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H
namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
/* Specialization for a row-major destination matrix => simple transposition of the product */
template<
  typename Index,
  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(
    Index rows, Index cols, Index depth,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsStride,
    ResScalar* res, Index resStride,
    ResScalar alpha,
    level3_blocking<RhsScalar,LhsScalar>& blocking,
    GemmParallelInfo<Index>* info = 0)
  {
    // transpose the product such that the result is column major
    general_matrix_matrix_product<Index,
      RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
      LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
      ColMajor>
    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
  }
};
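The row-major specialization relies on the identity C = A*B  <=>  C^T = B^T * A^T: a row-major C has the same memory layout as a column-major C^T, so the product is simply forwarded to the column-major code with the operands swapped and their storage orders flipped. A small standalone check of that identity with plain loops (illustrative only):

// Verify that computing B^T * A^T column-major fills the same memory as
// computing A * B row-major (3x2 times 2x4 example).
#include <cstdio>

int main()
{
  const int M = 3, K = 2, N = 4;
  double A[M*K], B[K*N], C_rm[M*N] = {0}, Ct_cm[M*N] = {0};
  for (int i = 0; i < M*K; ++i) A[i] = i + 1;        // A, row-major
  for (int i = 0; i < K*N; ++i) B[i] = 2*i - 3;      // B, row-major

  // C = A*B stored row-major: element (i,j) at C_rm[i*N + j]
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < K; ++k)
        C_rm[i*N + j] += A[i*K + k] * B[k*N + j];

  // C^T = B^T * A^T stored column-major: element (j,i) at Ct_cm[j + i*N]
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i)
      for (int k = 0; k < K; ++k)
        Ct_cm[j + i*N] += B[k*N + j] * A[i*K + k];

  int mismatches = 0;
  for (int i = 0; i < M*N; ++i) mismatches += (C_rm[i] != Ct_cm[i]);
  std::printf("mismatches: %d\n", mismatches);       // prints 0
  return 0;
}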
/* Specialization for a col-major destination matrix
* => Blocking algorithm following Goto's paper */
template
<
typename
Index
,
typename
LhsScalar
,
int
LhsStorageOrder
,
bool
ConjugateLhs
,
typename
RhsScalar
,
int
RhsStorageOrder
,
bool
ConjugateRhs
>
struct
general_matrix_matrix_product
<
Index
,
LhsScalar
,
LhsStorageOrder
,
ConjugateLhs
,
RhsScalar
,
RhsStorageOrder
,
ConjugateRhs
,
ColMajor
>
{
typedef
typename
scalar_product_traits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
static
void
run
(
Index
rows
,
Index
cols
,
Index
depth
,
const
LhsScalar
*
_lhs
,
Index
lhsStride
,
const
RhsScalar
*
_rhs
,
Index
rhsStride
,
ResScalar
*
res
,
Index
resStride
,
ResScalar
alpha
,
level3_blocking
<
LhsScalar
,
RhsScalar
>&
blocking
,
GemmParallelInfo
<
Index
>*
info
=
0
)
{
const_blas_data_mapper
<
LhsScalar
,
Index
,
LhsStorageOrder
>
lhs
(
_lhs
,
lhsStride
);
const_blas_data_mapper
<
RhsScalar
,
Index
,
RhsStorageOrder
>
rhs
(
_rhs
,
rhsStride
);
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
>
Traits
;
Index
kc
=
blocking
.
kc
();
// cache block size along the K direction
Index
mc
=
(
std
::
min
)(
rows
,
blocking
.
mc
());
// cache block size along the M direction
//Index nc = blocking.nc(); // cache block size along the N direction
gemm_pack_lhs
<
LhsScalar
,
Index
,
Traits
::
mr
,
Traits
::
LhsProgress
,
LhsStorageOrder
>
pack_lhs
;
gemm_pack_rhs
<
RhsScalar
,
Index
,
Traits
::
nr
,
RhsStorageOrder
>
pack_rhs
;
gebp_kernel
<
LhsScalar
,
RhsScalar
,
Index
,
Traits
::
mr
,
Traits
::
nr
,
ConjugateLhs
,
ConjugateRhs
>
gebp
;
#ifdef EIGEN_HAS_OPENMP
if
(
info
)
{
// this is the parallel version!
Index
tid
=
omp_get_thread_num
();
Index
threads
=
omp_get_num_threads
();
std
::
size_t
sizeA
=
kc
*
mc
;
std
::
size_t
sizeW
=
kc
*
Traits
::
WorkSpaceFactor
;
ei_declare_aligned_stack_constructed_variable
(
LhsScalar
,
blockA
,
sizeA
,
0
);
ei_declare_aligned_stack_constructed_variable
(
RhsScalar
,
w
,
sizeW
,
0
);
RhsScalar
*
blockB
=
blocking
.
blockB
();
eigen_internal_assert
(
blockB
!=
0
);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
for
(
Index
k
=
0
;
k
<
depth
;
k
+=
kc
)
{
const
Index
actual_kc
=
(
std
::
min
)(
k
+
kc
,
depth
)
-
k
;
// => rows of B', and cols of the A'
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing A'.
pack_lhs
(
blockA
,
&
lhs
(
0
,
k
),
lhsStride
,
actual_kc
,
mc
);
// Pack B_k to B' in a parallel fashion:
// each thread packs the sub block B_k,j to B'_j where j is the thread id.
// However, before copying to B'_j, we have to make sure that no other thread is still using it,
// i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while
(
info
[
tid
].
users
!=
0
)
{}
info
[
tid
].
users
+=
threads
;
pack_rhs
(
blockB
+
info
[
tid
].
rhs_start
*
actual_kc
,
&
rhs
(
k
,
info
[
tid
].
rhs_start
),
rhsStride
,
actual_kc
,
info
[
tid
].
rhs_length
);
// Notify the other threads that the part B'_j is ready to go.
info
[
tid
].
sync
=
k
;
// Computes C_i += A' * B' per B'_j
for
(
Index
shift
=
0
;
shift
<
threads
;
++
shift
)
{
Index
j
=
(
tid
+
shift
)
%
threads
;
// At this point we have to make sure that B'_j has been updated by the thread j,
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if
(
shift
>
0
)
while
(
info
[
j
].
sync
!=
k
)
{}
gebp
(
res
+
info
[
j
].
rhs_start
*
resStride
,
resStride
,
blockA
,
blockB
+
info
[
j
].
rhs_start
*
actual_kc
,
mc
,
actual_kc
,
info
[
j
].
rhs_length
,
alpha
,
-
1
,
-
1
,
0
,
0
,
w
);
}
// Then keep going as usual with the remaining A'
for
(
Index
i
=
mc
;
i
<
rows
;
i
+=
mc
)
{
const
Index
actual_mc
=
(
std
::
min
)(
i
+
mc
,
rows
)
-
i
;
// pack A_i,k to A'
pack_lhs
(
blockA
,
&
lhs
(
i
,
k
),
lhsStride
,
actual_kc
,
actual_mc
);
// C_i += A' * B'
gebp
(
res
+
i
,
resStride
,
blockA
,
blockB
,
actual_mc
,
actual_kc
,
cols
,
alpha
,
-
1
,
-
1
,
0
,
0
,
w
);
}
// Release all the sub blocks B'_j of B' for the current thread,
// i.e., we simply decrement the number of users by 1
for
(
Index
j
=
0
;
j
<
threads
;
++
j
)
#pragma omp atomic
--
(
info
[
j
].
users
);
}
}
else
#endif // EIGEN_HAS_OPENMP
{
EIGEN_UNUSED_VARIABLE
(
info
);
// this is the sequential version!
std
::
size_t
sizeA
=
kc
*
mc
;
std
::
size_t
sizeB
=
kc
*
cols
;
std
::
size_t
sizeW
=
kc
*
Traits
::
WorkSpaceFactor
;
ei_declare_aligned_stack_constructed_variable
(
LhsScalar
,
blockA
,
sizeA
,
blocking
.
blockA
());
ei_declare_aligned_stack_constructed_variable
(
RhsScalar
,
blockB
,
sizeB
,
blocking
.
blockB
());
ei_declare_aligned_stack_constructed_variable
(
RhsScalar
,
blockW
,
sizeW
,
blocking
.
blockW
());
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1)
for
(
Index
k2
=
0
;
k2
<
depth
;
k2
+=
kc
)
{
const
Index
actual_kc
=
(
std
::
min
)(
k2
+
kc
,
depth
)
-
k2
;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack rhs's panel into a sequential chunk of memory (L2 caching)
// Note that this panel will be read as many times as the number of blocks in the lhs's
// vertical panel which is, in practice, a very low number.
pack_rhs
(
blockB
,
&
rhs
(
k2
,
0
),
rhsStride
,
actual_kc
,
cols
);
// For each mc x kc block of the lhs's vertical panel...
// (==GEPP_VAR1)
for
(
Index
i2
=
0
;
i2
<
rows
;
i2
+=
mc
)
{
const
Index
actual_mc
=
(
std
::
min
)(
i2
+
mc
,
rows
)
-
i2
;
// We pack the lhs's block into a sequential chunk of memory (L1 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro vertical panel of the large rhs's panel (e.g., cols/4 times).
pack_lhs
(
blockA
,
&
lhs
(
i2
,
k2
),
lhsStride
,
actual_kc
,
actual_mc
);
// Everything is packed, we can now call the block * panel kernel:
gebp
(
res
+
i2
,
resStride
,
blockA
,
blockB
,
actual_mc
,
actual_kc
,
cols
,
alpha
,
-
1
,
-
1
,
0
,
0
,
blockW
);
}
}
}
}
};
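The column-major path above follows the usual Goto blocking: for each kc-deep horizontal panel of B, pack it once, then for each mc x kc block of A pack that block and run the packed-block * packed-panel kernel. A compact scalar sketch of the same loop nest, with a naive triple loop standing in for the gebp kernel (illustrative only, not Eigen's implementation):

// Scalar sketch of the blocked GEMM driver above: C += alpha * A * B,
// all matrices column-major; packing is modeled by plain copies.
#include <vector>
#include <algorithm>

void blockedGemm(int rows, int cols, int depth,
                 const double* A, int lda, const double* B, int ldb,
                 double* C, int ldc, double alpha, int mc, int kc)
{
  std::vector<double> blockA(static_cast<size_t>(mc) * kc);
  std::vector<double> blockB(static_cast<size_t>(kc) * cols);

  for (int k2 = 0; k2 < depth; k2 += kc)
  {
    const int akc = std::min(k2 + kc, depth) - k2;
    // pack the kc x cols panel of B (it is read many times below)
    for (int j = 0; j < cols; ++j)
      for (int k = 0; k < akc; ++k)
        blockB[k + j*akc] = B[(k2 + k) + j*ldb];

    for (int i2 = 0; i2 < rows; i2 += mc)
    {
      const int amc = std::min(i2 + mc, rows) - i2;
      // pack the mc x kc block of A
      for (int k = 0; k < akc; ++k)
        for (int i = 0; i < amc; ++i)
          blockA[i + k*amc] = A[(i2 + i) + (k2 + k)*lda];

      // "gebp": packed block * packed panel
      for (int j = 0; j < cols; ++j)
        for (int k = 0; k < akc; ++k)
          for (int i = 0; i < amc; ++i)
            C[(i2 + i) + j*ldc] += alpha * blockA[i + k*amc] * blockB[k + j*akc];
    }
  }
}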
/*********************************************************************************
* Specialization of GeneralProduct<> for "large" GEMM, i.e.,
* implementation of the high level wrapper to general_matrix_matrix_product
**********************************************************************************/
template
<
typename
Lhs
,
typename
Rhs
>
struct
traits
<
GeneralProduct
<
Lhs
,
Rhs
,
GemmProduct
>
>
:
traits
<
ProductBase
<
GeneralProduct
<
Lhs
,
Rhs
,
GemmProduct
>
,
Lhs
,
Rhs
>
>
{};
template
<
typename
Scalar
,
typename
Index
,
typename
Gemm
,
typename
Lhs
,
typename
Rhs
,
typename
Dest
,
typename
BlockingType
>
struct
gemm_functor
{
gemm_functor
(
const
Lhs
&
lhs
,
const
Rhs
&
rhs
,
Dest
&
dest
,
Scalar
actualAlpha
,
BlockingType
&
blocking
)
:
m_lhs
(
lhs
),
m_rhs
(
rhs
),
m_dest
(
dest
),
m_actualAlpha
(
actualAlpha
),
m_blocking
(
blocking
)
{}
void
initParallelSession
()
const
{
m_blocking
.
allocateB
();
}
void
operator
()
(
Index
row
,
Index
rows
,
Index
col
=
0
,
Index
cols
=-
1
,
GemmParallelInfo
<
Index
>*
info
=
0
)
const
{
if
(
cols
==-
1
)
cols
=
m_rhs
.
cols
();
Gemm
::
run
(
rows
,
cols
,
m_lhs
.
cols
(),
/*(const Scalar*)*/
&
m_lhs
.
coeffRef
(
row
,
0
),
m_lhs
.
outerStride
(),
/*(const Scalar*)*/
&
m_rhs
.
coeffRef
(
0
,
col
),
m_rhs
.
outerStride
(),
(
Scalar
*
)
&
(
m_dest
.
coeffRef
(
row
,
col
)),
m_dest
.
outerStride
(),
m_actualAlpha
,
m_blocking
,
info
);
}
protected:
const
Lhs
&
m_lhs
;
const
Rhs
&
m_rhs
;
Dest
&
m_dest
;
Scalar
m_actualAlpha
;
BlockingType
&
m_blocking
;
};
template
<
int
StorageOrder
,
typename
LhsScalar
,
typename
RhsScalar
,
int
MaxRows
,
int
MaxCols
,
int
MaxDepth
,
int
KcFactor
=
1
,
bool
FiniteAtCompileTime
=
MaxRows
!=
Dynamic
&&
MaxCols
!=
Dynamic
&&
MaxDepth
!=
Dynamic
>
class
gemm_blocking_space
;
template<typename _LhsScalar, typename _RhsScalar>
class level3_blocking
{
    typedef _LhsScalar LhsScalar;
    typedef _RhsScalar RhsScalar;

  protected:
    LhsScalar* m_blockA;
    RhsScalar* m_blockB;
    RhsScalar* m_blockW;

    DenseIndex m_mc;
    DenseIndex m_nc;
    DenseIndex m_kc;

  public:

    level3_blocking()
      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
    {}

    inline DenseIndex mc() const { return m_mc; }
    inline DenseIndex nc() const { return m_nc; }
    inline DenseIndex kc() const { return m_kc; }

    inline LhsScalar* blockA() { return m_blockA; }
    inline RhsScalar* blockB() { return m_blockB; }
    inline RhsScalar* blockW() { return m_blockW; }
};
template
<
int
StorageOrder
,
typename
_LhsScalar
,
typename
_RhsScalar
,
int
MaxRows
,
int
MaxCols
,
int
MaxDepth
,
int
KcFactor
>
class
gemm_blocking_space
<
StorageOrder
,
_LhsScalar
,
_RhsScalar
,
MaxRows
,
MaxCols
,
MaxDepth
,
KcFactor
,
true
>
:
public
level3_blocking
<
typename
conditional
<
StorageOrder
==
RowMajor
,
_RhsScalar
,
_LhsScalar
>::
type
,
typename
conditional
<
StorageOrder
==
RowMajor
,
_LhsScalar
,
_RhsScalar
>::
type
>
{
enum
{
Transpose
=
StorageOrder
==
RowMajor
,
ActualRows
=
Transpose
?
MaxCols
:
MaxRows
,
ActualCols
=
Transpose
?
MaxRows
:
MaxCols
};
typedef
typename
conditional
<
Transpose
,
_RhsScalar
,
_LhsScalar
>::
type
LhsScalar
;
typedef
typename
conditional
<
Transpose
,
_LhsScalar
,
_RhsScalar
>::
type
RhsScalar
;
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
>
Traits
;
enum
{
SizeA
=
ActualRows
*
MaxDepth
,
SizeB
=
ActualCols
*
MaxDepth
,
SizeW
=
MaxDepth
*
Traits
::
WorkSpaceFactor
};
EIGEN_ALIGN16
LhsScalar
m_staticA
[
SizeA
];
EIGEN_ALIGN16
RhsScalar
m_staticB
[
SizeB
];
EIGEN_ALIGN16
RhsScalar
m_staticW
[
SizeW
];
public:
gemm_blocking_space
(
DenseIndex
/*rows*/
,
DenseIndex
/*cols*/
,
DenseIndex
/*depth*/
)
{
this
->
m_mc
=
ActualRows
;
this
->
m_nc
=
ActualCols
;
this
->
m_kc
=
MaxDepth
;
this
->
m_blockA
=
m_staticA
;
this
->
m_blockB
=
m_staticB
;
this
->
m_blockW
=
m_staticW
;
}
inline
void
allocateA
()
{}
inline
void
allocateB
()
{}
inline
void
allocateW
()
{}
inline
void
allocateAll
()
{}
};
template
<
int
StorageOrder
,
typename
_LhsScalar
,
typename
_RhsScalar
,
int
MaxRows
,
int
MaxCols
,
int
MaxDepth
,
int
KcFactor
>
class
gemm_blocking_space
<
StorageOrder
,
_LhsScalar
,
_RhsScalar
,
MaxRows
,
MaxCols
,
MaxDepth
,
KcFactor
,
false
>
:
public
level3_blocking
<
typename
conditional
<
StorageOrder
==
RowMajor
,
_RhsScalar
,
_LhsScalar
>::
type
,
typename
conditional
<
StorageOrder
==
RowMajor
,
_LhsScalar
,
_RhsScalar
>::
type
>
{
enum
{
Transpose
=
StorageOrder
==
RowMajor
};
typedef
typename
conditional
<
Transpose
,
_RhsScalar
,
_LhsScalar
>::
type
LhsScalar
;
typedef
typename
conditional
<
Transpose
,
_LhsScalar
,
_RhsScalar
>::
type
RhsScalar
;
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
>
Traits
;
DenseIndex
m_sizeA
;
DenseIndex
m_sizeB
;
DenseIndex
m_sizeW
;
public:
gemm_blocking_space
(
DenseIndex
rows
,
DenseIndex
cols
,
DenseIndex
depth
)
{
this
->
m_mc
=
Transpose
?
cols
:
rows
;
this
->
m_nc
=
Transpose
?
rows
:
cols
;
this
->
m_kc
=
depth
;
computeProductBlockingSizes
<
LhsScalar
,
RhsScalar
,
KcFactor
>
(
this
->
m_kc
,
this
->
m_mc
,
this
->
m_nc
);
m_sizeA
=
this
->
m_mc
*
this
->
m_kc
;
m_sizeB
=
this
->
m_kc
*
this
->
m_nc
;
m_sizeW
=
this
->
m_kc
*
Traits
::
WorkSpaceFactor
;
}
void
allocateA
()
{
if
(
this
->
m_blockA
==
0
)
this
->
m_blockA
=
aligned_new
<
LhsScalar
>
(
m_sizeA
);
}
void
allocateB
()
{
if
(
this
->
m_blockB
==
0
)
this
->
m_blockB
=
aligned_new
<
RhsScalar
>
(
m_sizeB
);
}
void
allocateW
()
{
if
(
this
->
m_blockW
==
0
)
this
->
m_blockW
=
aligned_new
<
RhsScalar
>
(
m_sizeW
);
}
void
allocateAll
()
{
allocateA
();
allocateB
();
allocateW
();
}
~
gemm_blocking_space
()
{
aligned_delete
(
this
->
m_blockA
,
m_sizeA
);
aligned_delete
(
this
->
m_blockB
,
m_sizeB
);
aligned_delete
(
this
->
m_blockW
,
m_sizeW
);
}
};
}
// end namespace internal
template
<
typename
Lhs
,
typename
Rhs
>
class
GeneralProduct
<
Lhs
,
Rhs
,
GemmProduct
>
:
public
ProductBase
<
GeneralProduct
<
Lhs
,
Rhs
,
GemmProduct
>
,
Lhs
,
Rhs
>
{
enum
{
MaxDepthAtCompileTime
=
EIGEN_SIZE_MIN_PREFER_FIXED
(
Lhs
::
MaxColsAtCompileTime
,
Rhs
::
MaxRowsAtCompileTime
)
};
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE
(
GeneralProduct
)
typedef
typename
Lhs
::
Scalar
LhsScalar
;
typedef
typename
Rhs
::
Scalar
RhsScalar
;
typedef
Scalar
ResScalar
;
GeneralProduct
(
const
Lhs
&
lhs
,
const
Rhs
&
rhs
)
:
Base
(
lhs
,
rhs
)
{
typedef
internal
::
scalar_product_op
<
LhsScalar
,
RhsScalar
>
BinOp
;
EIGEN_CHECK_BINARY_COMPATIBILIY
(
BinOp
,
LhsScalar
,
RhsScalar
);
}
template
<
typename
Dest
>
void
scaleAndAddTo
(
Dest
&
dst
,
Scalar
alpha
)
const
{
eigen_assert
(
dst
.
rows
()
==
m_lhs
.
rows
()
&&
dst
.
cols
()
==
m_rhs
.
cols
());
typename
internal
::
add_const_on_value_type
<
ActualLhsType
>::
type
lhs
=
LhsBlasTraits
::
extract
(
m_lhs
);
typename
internal
::
add_const_on_value_type
<
ActualRhsType
>::
type
rhs
=
RhsBlasTraits
::
extract
(
m_rhs
);
Scalar
actualAlpha
=
alpha
*
LhsBlasTraits
::
extractScalarFactor
(
m_lhs
)
*
RhsBlasTraits
::
extractScalarFactor
(
m_rhs
);
typedef
internal
::
gemm_blocking_space
<
(
Dest
::
Flags
&
RowMajorBit
)
?
RowMajor
:
ColMajor
,
LhsScalar
,
RhsScalar
,
Dest
::
MaxRowsAtCompileTime
,
Dest
::
MaxColsAtCompileTime
,
MaxDepthAtCompileTime
>
BlockingType
;
typedef
internal
::
gemm_functor
<
Scalar
,
Index
,
internal
::
general_matrix_matrix_product
<
Index
,
LhsScalar
,
(
_ActualLhsType
::
Flags
&
RowMajorBit
)
?
RowMajor
:
ColMajor
,
bool
(
LhsBlasTraits
::
NeedToConjugate
),
RhsScalar
,
(
_ActualRhsType
::
Flags
&
RowMajorBit
)
?
RowMajor
:
ColMajor
,
bool
(
RhsBlasTraits
::
NeedToConjugate
),
(
Dest
::
Flags
&
RowMajorBit
)
?
RowMajor
:
ColMajor
>
,
_ActualLhsType
,
_ActualRhsType
,
Dest
,
BlockingType
>
GemmFunctor
;
BlockingType
blocking
(
dst
.
rows
(),
dst
.
cols
(),
lhs
.
cols
());
internal
::
parallelize_gemm
<
(
Dest
::
MaxRowsAtCompileTime
>
32
||
Dest
::
MaxRowsAtCompileTime
==
Dynamic
)
>
(
GemmFunctor
(
lhs
,
rhs
,
dst
,
actualAlpha
,
blocking
),
this
->
rows
(),
this
->
cols
(),
Dest
::
Flags
&
RowMajorBit
);
}
};
}
// end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
namespace Eigen {

namespace internal {
/**********************************************************************
* This file implements a general A * B product while
* evaluating only one triangular part of the product.
* This is more general version of self adjoint product (C += A A^T)
* as the level 3 SYRK Blas routine.
**********************************************************************/
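At the user level, this triangular-only product is what backs rank updates on SelfAdjointView: for example, C += alpha * A * A^T touching only the lower triangle can be written with Eigen's public API as in the short usage sketch below.

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 3);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(5, 5);

  // Only the lower triangular part of C is written.
  C.selfadjointView<Eigen::Lower>().rankUpdate(A, 2.0);   // C += 2 * A * A^T

  std::cout << C << std::endl;   // strictly upper part stays zero
  return 0;
}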
// forward declarations (defined at the end of this file)
template<typename LhsScalar, typename RhsScalar, typename Index,
         int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
struct tribb_kernel;
/* Optimized matrix-matrix product evaluating only one triangular half */
template
<
typename
Index
,
typename
LhsScalar
,
int
LhsStorageOrder
,
bool
ConjugateLhs
,
typename
RhsScalar
,
int
RhsStorageOrder
,
bool
ConjugateRhs
,
int
ResStorageOrder
,
int
UpLo
,
int
Version
=
Specialized
>
struct
general_matrix_matrix_triangular_product
;
// as usual if the result is row major => we transpose the product
template
<
typename
Index
,
typename
LhsScalar
,
int
LhsStorageOrder
,
bool
ConjugateLhs
,
typename
RhsScalar
,
int
RhsStorageOrder
,
bool
ConjugateRhs
,
int
UpLo
,
int
Version
>
struct
general_matrix_matrix_triangular_product
<
Index
,
LhsScalar
,
LhsStorageOrder
,
ConjugateLhs
,
RhsScalar
,
RhsStorageOrder
,
ConjugateRhs
,
RowMajor
,
UpLo
,
Version
>
{
typedef
typename
scalar_product_traits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
size
,
Index
depth
,
const
LhsScalar
*
lhs
,
Index
lhsStride
,
const
RhsScalar
*
rhs
,
Index
rhsStride
,
ResScalar
*
res
,
Index
resStride
,
ResScalar
alpha
)
{
general_matrix_matrix_triangular_product
<
Index
,
RhsScalar
,
RhsStorageOrder
==
RowMajor
?
ColMajor
:
RowMajor
,
ConjugateRhs
,
LhsScalar
,
LhsStorageOrder
==
RowMajor
?
ColMajor
:
RowMajor
,
ConjugateLhs
,
ColMajor
,
UpLo
==
Lower
?
Upper
:
Lower
>
::
run
(
size
,
depth
,
rhs
,
rhsStride
,
lhs
,
lhsStride
,
res
,
resStride
,
alpha
);
}
};
template
<
typename
Index
,
typename
LhsScalar
,
int
LhsStorageOrder
,
bool
ConjugateLhs
,
typename
RhsScalar
,
int
RhsStorageOrder
,
bool
ConjugateRhs
,
int
UpLo
,
int
Version
>
struct
general_matrix_matrix_triangular_product
<
Index
,
LhsScalar
,
LhsStorageOrder
,
ConjugateLhs
,
RhsScalar
,
RhsStorageOrder
,
ConjugateRhs
,
ColMajor
,
UpLo
,
Version
>
{
typedef
typename
scalar_product_traits
<
LhsScalar
,
RhsScalar
>::
ReturnType
ResScalar
;
static
EIGEN_STRONG_INLINE
void
run
(
Index
size
,
Index
depth
,
const
LhsScalar
*
_lhs
,
Index
lhsStride
,
const
RhsScalar
*
_rhs
,
Index
rhsStride
,
ResScalar
*
res
,
Index
resStride
,
ResScalar
alpha
)
{
const_blas_data_mapper
<
LhsScalar
,
Index
,
LhsStorageOrder
>
lhs
(
_lhs
,
lhsStride
);
const_blas_data_mapper
<
RhsScalar
,
Index
,
RhsStorageOrder
>
rhs
(
_rhs
,
rhsStride
);
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
>
Traits
;
Index
kc
=
depth
;
// cache block size along the K direction
Index
mc
=
size
;
// cache block size along the M direction
Index
nc
=
size
;
// cache block size along the N direction
computeProductBlockingSizes
<
LhsScalar
,
RhsScalar
>
(
kc
,
mc
,
nc
);
// !!! mc must be a multiple of nr:
if
(
mc
>
Traits
::
nr
)
mc
=
(
mc
/
Traits
::
nr
)
*
Traits
::
nr
;
std
::
size_t
sizeW
=
kc
*
Traits
::
WorkSpaceFactor
;
std
::
size_t
sizeB
=
sizeW
+
kc
*
size
;
ei_declare_aligned_stack_constructed_variable
(
LhsScalar
,
blockA
,
kc
*
mc
,
0
);
ei_declare_aligned_stack_constructed_variable
(
RhsScalar
,
allocatedBlockB
,
sizeB
,
0
);
RhsScalar
*
blockB
=
allocatedBlockB
+
sizeW
;
gemm_pack_lhs
<
LhsScalar
,
Index
,
Traits
::
mr
,
Traits
::
LhsProgress
,
LhsStorageOrder
>
pack_lhs
;
gemm_pack_rhs
<
RhsScalar
,
Index
,
Traits
::
nr
,
RhsStorageOrder
>
pack_rhs
;
gebp_kernel
<
LhsScalar
,
RhsScalar
,
Index
,
Traits
::
mr
,
Traits
::
nr
,
ConjugateLhs
,
ConjugateRhs
>
gebp
;
tribb_kernel
<
LhsScalar
,
RhsScalar
,
Index
,
Traits
::
mr
,
Traits
::
nr
,
ConjugateLhs
,
ConjugateRhs
,
UpLo
>
sybb
;
for
(
Index
k2
=
0
;
k2
<
depth
;
k2
+=
kc
)
{
const
Index
actual_kc
=
(
std
::
min
)(
k2
+
kc
,
depth
)
-
k2
;
// note that the actual rhs is the transpose/adjoint of mat
pack_rhs
(
blockB
,
&
rhs
(
k2
,
0
),
rhsStride
,
actual_kc
,
size
);
for
(
Index
i2
=
0
;
i2
<
size
;
i2
+=
mc
)
{
const
Index
actual_mc
=
(
std
::
min
)(
i2
+
mc
,
size
)
-
i2
;
pack_lhs
(
blockA
,
&
lhs
(
i2
,
k2
),
lhsStride
,
actual_kc
,
actual_mc
);
// the selected actual_mc * size panel of res is split into three different part:
// 1 - before the diagonal => processed with gebp or skipped
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if
(
UpLo
==
Lower
)
gebp
(
res
+
i2
,
resStride
,
blockA
,
blockB
,
actual_mc
,
actual_kc
,
(
std
::
min
)(
size
,
i2
),
alpha
,
-
1
,
-
1
,
0
,
0
,
allocatedBlockB
);
sybb
(
res
+
resStride
*
i2
+
i2
,
resStride
,
blockA
,
blockB
+
actual_kc
*
i2
,
actual_mc
,
actual_kc
,
alpha
,
allocatedBlockB
);
if
(
UpLo
==
Upper
)
{
Index
j2
=
i2
+
actual_mc
;
gebp
(
res
+
resStride
*
j2
+
i2
,
resStride
,
blockA
,
blockB
+
actual_kc
*
j2
,
actual_mc
,
actual_kc
,
(
std
::
max
)(
Index
(
0
),
size
-
j2
),
alpha
,
-
1
,
-
1
,
0
,
0
,
allocatedBlockB
);
}
}
}
}
};
// Optimized packed Block * packed Block product kernel evaluating only one given triangular part
// This kernel is built on top of the gebp kernel:
// - the current destination block is processed per panel of actual_mc x BlockSize
// where BlockSize is set to the minimal value allowing gebp to be as fast as possible
// - then, as usual, each panel is split into three parts along the diagonal,
// the sub blocks above and below the diagonal are processed as usual,
// while the triangular block overlapping the diagonal is evaluated into a
// small temporary buffer which is then accumulated into the result using a
// triangular traversal.
template
<
typename
LhsScalar
,
typename
RhsScalar
,
typename
Index
,
int
mr
,
int
nr
,
bool
ConjLhs
,
bool
ConjRhs
,
int
UpLo
>
struct
tribb_kernel
{
typedef
gebp_traits
<
LhsScalar
,
RhsScalar
,
ConjLhs
,
ConjRhs
>
Traits
;
typedef
typename
Traits
::
ResScalar
ResScalar
;
enum
{
BlockSize
=
EIGEN_PLAIN_ENUM_MAX
(
mr
,
nr
)
};
void
operator
()(
ResScalar
*
res
,
Index
resStride
,
const
LhsScalar
*
blockA
,
const
RhsScalar
*
blockB
,
Index
size
,
Index
depth
,
ResScalar
alpha
,
RhsScalar
*
workspace
)
{
gebp_kernel
<
LhsScalar
,
RhsScalar
,
Index
,
mr
,
nr
,
ConjLhs
,
ConjRhs
>
gebp_kernel
;
Matrix
<
ResScalar
,
BlockSize
,
BlockSize
,
ColMajor
>
buffer
;
// let's process the block per panel of actual_mc x BlockSize,
// again, each is split into three parts, etc.
for
(
Index
j
=
0
;
j
<
size
;
j
+=
BlockSize
)
{
Index
actualBlockSize
=
std
::
min
<
Index
>
(
BlockSize
,
size
-
j
);
const
RhsScalar
*
actual_b
=
blockB
+
j
*
depth
;
if
(
UpLo
==
Upper
)
gebp_kernel
(
res
+
j
*
resStride
,
resStride
,
blockA
,
actual_b
,
j
,
depth
,
actualBlockSize
,
alpha
,
-
1
,
-
1
,
0
,
0
,
workspace
);
// selfadjoint micro block
{
Index
i
=
j
;
buffer
.
setZero
();
// 1 - apply the kernel on the temporary buffer
gebp_kernel
(
buffer
.
data
(),
BlockSize
,
blockA
+
depth
*
i
,
actual_b
,
actualBlockSize
,
depth
,
actualBlockSize
,
alpha
,
-
1
,
-
1
,
0
,
0
,
workspace
);
// 2 - triangular accumulation
for
(
Index
j1
=
0
;
j1
<
actualBlockSize
;
++
j1
)
{
ResScalar
*
r
=
res
+
(
j
+
j1
)
*
resStride
+
i
;
for
(
Index
i1
=
UpLo
==
Lower
?
j1
:
0
;
UpLo
==
Lower
?
i1
<
actualBlockSize
:
i1
<=
j1
;
++
i1
)
r
[
i1
]
+=
buffer
(
i1
,
j1
);
}
}
if
(
UpLo
==
Lower
)
{
Index
i
=
j
+
actualBlockSize
;
gebp_kernel
(
res
+
j
*
resStride
+
i
,
resStride
,
blockA
+
depth
*
i
,
actual_b
,
size
-
i
,
depth
,
actualBlockSize
,
alpha
,
-
1
,
-
1
,
0
,
0
,
workspace
);
}
}
}
};
}
// end namespace internal
// high level API
template
<
typename
MatrixType
,
unsigned
int
UpLo
>
template
<
typename
ProductDerived
,
typename
_Lhs
,
typename
_Rhs
>
TriangularView
<
MatrixType
,
UpLo
>&
TriangularView
<
MatrixType
,
UpLo
>::
assignProduct
(
const
ProductBase
<
ProductDerived
,
_Lhs
,
_Rhs
>&
prod
,
const
Scalar
&
alpha
)
{
typedef
typename
internal
::
remove_all
<
typename
ProductDerived
::
LhsNested
>::
type
Lhs
;
typedef
internal
::
blas_traits
<
Lhs
>
LhsBlasTraits
;
typedef
typename
LhsBlasTraits
::
DirectLinearAccessType
ActualLhs
;
typedef
typename
internal
::
remove_all
<
ActualLhs
>::
type
_ActualLhs
;
typename
internal
::
add_const_on_value_type
<
ActualLhs
>::
type
actualLhs
=
LhsBlasTraits
::
extract
(
prod
.
lhs
());
typedef
typename
internal
::
remove_all
<
typename
ProductDerived
::
RhsNested
>::
type
Rhs
;
typedef
internal
::
blas_traits
<
Rhs
>
RhsBlasTraits
;
typedef
typename
RhsBlasTraits
::
DirectLinearAccessType
ActualRhs
;
typedef
typename
internal
::
remove_all
<
ActualRhs
>::
type
_ActualRhs
;
typename
internal
::
add_const_on_value_type
<
ActualRhs
>::
type
actualRhs
=
RhsBlasTraits
::
extract
(
prod
.
rhs
());
typename
ProductDerived
::
Scalar
actualAlpha
=
alpha
*
LhsBlasTraits
::
extractScalarFactor
(
prod
.
lhs
().
derived
())
*
RhsBlasTraits
::
extractScalarFactor
(
prod
.
rhs
().
derived
());
internal
::
general_matrix_matrix_triangular_product
<
Index
,
typename
Lhs
::
Scalar
,
_ActualLhs
::
Flags
&
RowMajorBit
?
RowMajor
:
ColMajor
,
LhsBlasTraits
::
NeedToConjugate
,
typename
Rhs
::
Scalar
,
_ActualRhs
::
Flags
&
RowMajorBit
?
RowMajor
:
ColMajor
,
RhsBlasTraits
::
NeedToConjugate
,
MatrixType
::
Flags
&
RowMajorBit
?
RowMajor
:
ColMajor
,
UpLo
>
::
run
(
m_matrix
.
cols
(),
actualLhs
.
cols
(),
&
actualLhs
.
coeffRef
(
0
,
0
),
actualLhs
.
outerStride
(),
&
actualRhs
.
coeffRef
(
0
,
0
),
actualRhs
.
outerStride
(),
const_cast
<
Scalar
*>
(
m_matrix
.
data
()),
m_matrix
.
outerStride
(),
actualAlpha
);
return
*
this
;
}
}
// end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
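Aside (not part of the imported header): to my reading, the public entry points that end up in the triangular-result kernel above are the rank-update routines on SelfAdjointView and TriangularView::assignProduct. The sketch below only uses documented Eigen API; the statement about which internal kernel it dispatches to is an assumption based on the code above.

// Minimal sketch: symmetric rank-k update, lower triangle of C += alpha * A * A^T.
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(4, 4);
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 3);

  // Only the lower triangle of C is written, which is exactly the situation
  // the triangular-result product above is written for (UpLo == Lower).
  C.selfadjointView<Eigen::Lower>().rankUpdate(A, 0.5);
  return 0;
}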
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
0 → 100644
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* Level 3 BLAS SYRK/HERK implementation.
********************************************************************************
*/
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
namespace Eigen {

namespace internal {

template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
struct general_matrix_matrix_rankupdate :
       general_matrix_matrix_triangular_product<
         Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};

// try to go to BLAS specialization
#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
                          int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
  { \
    if (lhs==rhs) { \
      general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
    } else { \
      general_matrix_matrix_triangular_product<Index, \
        Scalar, LhsStorageOrder, ConjugateLhs, \
        Scalar, RhsStorageOrder, ConjugateRhs, \
        ColMajor, UpLo, BuiltIn> \
      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
    } \
  } \
};

EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)

// SYRK for float/double
#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
  enum { \
    IsLower = (UpLo&Lower) == Lower, \
    LowUp = IsLower ? Lower : Upper, \
    conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
  }; \
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
  { \
  /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
\
   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
   MKLTYPE alpha_, beta_; \
\
   /* Set alpha_ & beta_ */ \
   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
  } \
};

// HERK for complex data
#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
  enum { \
    IsLower = (UpLo&Lower) == Lower, \
    LowUp = IsLower ? Lower : Upper, \
    conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
  }; \
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
  { \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
\
   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
   RTYPE alpha_, beta_; \
   const EIGTYPE* a_ptr; \
\
   /* Set alpha_ & beta_ */ \
   /* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */ \
   /* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
   alpha_ = alpha.real(); \
   beta_ = 1.0; \
   /* Copy with conjugation in some cases*/ \
   MatrixType a; \
   if (conjA) { \
     Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
     a = mapA.conjugate(); \
     lda = a.outerStride(); \
     a_ptr = a.data(); \
   } else a_ptr=lhs; \
   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
  } \
};

EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
EIGEN_MKL_RANKUPDATE_R(float,  float,  ssyrk)

//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8,  double, cherk)

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
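Aside (not from the imported file): the EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk) expansion above forwards to BLAS dsyrk with column-major conventions. As a rough illustration of that argument mapping, here is a standalone call against a Fortran-style dsyrk_ symbol; the exact symbol name and link line depend on the BLAS/MKL build, so treat this as a sketch only.

// Sketch: lower triangle of C (3x3) = 1.0*A*A^T + 1.0*C, with A a 3x2 column-major matrix.
extern "C" void dsyrk_(const char* uplo, const char* trans, const int* n, const int* k,
                       const double* alpha, const double* a, const int* lda,
                       const double* beta, double* c, const int* ldc);

int main()
{
  const int n = 3, k = 2, lda = 3, ldc = 3;
  double A[6] = {1, 2, 3,  4, 5, 6};          // column-major 3x2
  double C[9] = {0, 0, 0,  0, 0, 0,  0, 0, 0};
  const double alpha = 1.0, beta = 1.0;
  const char uplo = 'L', trans = 'N';         // same flags the macro derives from UpLo / AStorageOrder
  dsyrk_(&uplo, &trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc);
  return 0;
}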
densecrf/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
0 → 100644
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* General matrix-matrix product functionality based on ?GEMM.
********************************************************************************
*/
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
namespace Eigen {

namespace internal {

/**********************************************************************
* This file implements general matrix-matrix multiplication using BLAS
* gemm function via partial specialization of
* general_matrix_matrix_product::run(..) method for float, double,
* std::complex<float> and std::complex<double> types
**********************************************************************/

// gemm specialization

#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
template< \
  typename Index, \
  int LhsStorageOrder, bool ConjugateLhs, \
  int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
{ \
static void run(Index rows, Index cols, Index depth, \
  const EIGTYPE* _lhs, Index lhsStride, \
  const EIGTYPE* _rhs, Index rhsStride, \
  EIGTYPE* res, Index resStride, \
  EIGTYPE alpha, \
  level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
  GemmParallelInfo<Index>* /*info = 0*/) \
{ \
  using std::conj; \
\
  char transa, transb; \
  MKL_INT m, n, k, lda, ldb, ldc; \
  const EIGTYPE *a, *b; \
  MKLTYPE alpha_, beta_; \
  MatrixX##EIGPREFIX a_tmp, b_tmp; \
  EIGTYPE myone(1);\
\
  /* Set transpose options */ \
  transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
  transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
\
  /* Set m, n, k */ \
  m = (MKL_INT)rows;  \
  n = (MKL_INT)cols;  \
  k = (MKL_INT)depth; \
\
  /* Set alpha_ & beta_ */ \
  assign_scalar_eig2mkl(alpha_, alpha); \
  assign_scalar_eig2mkl(beta_, myone); \
\
  /* Set lda, ldb, ldc */ \
  lda = (MKL_INT)lhsStride; \
  ldb = (MKL_INT)rhsStride; \
  ldc = (MKL_INT)resStride; \
\
  /* Set a, b, c */ \
  if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
    a_tmp = lhs.conjugate(); \
    a = a_tmp.data(); \
    lda = a_tmp.outerStride(); \
  } else a = _lhs; \
\
  if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
    b_tmp = rhs.conjugate(); \
    b = b_tmp.data(); \
    ldb = b_tmp.outerStride(); \
  } else b = _rhs; \
\
  MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
}};

GEMM_SPECIALIZATION(double,   d,  double,        d)
GEMM_SPECIALIZATION(float,    f,  float,         s)
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  c)

} // end namespase internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
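Aside (not from the imported file): these ?gemm bindings are only compiled in when Eigen's MKL support is enabled. A minimal way to exercise them, assuming MKL is installed and the matching link flags are used, is to define EIGEN_USE_MKL_ALL before including Eigen; the product below then dispatches to dgemm instead of the built-in kernel.

// Sketch: enable the MKL backend for a plain matrix product (assumes an MKL installation).
#define EIGEN_USE_MKL_ALL
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(256, 256);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(256, 256);
  Eigen::MatrixXd C(256, 256);
  C.noalias() = A * B;   // handled by GEMM_SPECIALIZATION(double, ...) above when MKL is active
  return 0;
}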
densecrf/include/Eigen/src/Core/products/GeneralMatrixVector.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H
namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at onces that allows to both reduce
 * the number of load/stores of the result by a factor 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index
  #ifdef EIGEN_INTERNAL_DEBUGGING
    resIncr
  #endif
  , RhsScalar alpha)
{
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns do we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skiped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                          || (skipColumns + columnsAtOnce >= cols)
                          || LhsPacketSize > size
                          || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
};

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at onces that allows to both reduce
 * the number of load/stores of the result by a factor 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index depth = cols;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows do we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skiped columns are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                          || LhsPacketSize==1
                          || (skipRows + rowsAtOnce >= rows)
                          || LhsPacketSize > depth
                          || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop get vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we proccess 4 rows with with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlaping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop get vectorized by the compiler !
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]            += alpha*tmp0;
    res[(i+offset1)*resIncr]  += alpha*tmp1;
    res[(i+2)*resIncr]        += alpha*tmp2;
    res[(i+offset3)*resIncr]  += alpha*tmp3;
  }

  // process remaining first and last rows (at most columnsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
      // FIXME this loop get vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop get vectorized by the compiler !
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
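Aside (not from the imported file): these two kernels are what a gemv-shaped expression such as y.noalias() += alpha * A * x ultimately runs for dense float/double operands; which specialization is used follows the storage order of A. The statement about the internal dispatch is an assumption based on the code above; the example itself uses only public API.

// Sketch: matrix-vector accumulation that should exercise the col-major kernel above.
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(8, 5);   // MatrixXd is column-major by default
  Eigen::VectorXd x = Eigen::VectorXd::Random(5);
  Eigen::VectorXd y = Eigen::VectorXd::Zero(8);
  y.noalias() += 0.5 * A * x;                          // y += alpha * A * x, evaluated without a temporary
  return 0;
}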
densecrf/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
0 → 100644
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* General matrix-vector product functionality based on ?GEMV.
********************************************************************************
*/
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
namespace Eigen {

namespace internal {

/**********************************************************************
* This file implements general matrix-vector multiplication using BLAS
* gemv function via partial specialization of
* general_matrix_vector_product::run(..) method for float, double,
* std::complex<float> and std::complex<double> types
**********************************************************************/

// gemv specialization

template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
struct general_matrix_vector_product_gemv :
  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};

#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
static EIGEN_DONT_INLINE void run( \
  Index rows, Index cols, \
  const Scalar* lhs, Index lhsStride, \
  const Scalar* rhs, Index rhsIncr, \
  Scalar* res, Index resIncr, Scalar alpha) \
{ \
  if (ConjugateLhs) { \
    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
  } else { \
    general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
  } \
} \
}; \
template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
static EIGEN_DONT_INLINE void run( \
  Index rows, Index cols, \
  const Scalar* lhs, Index lhsStride, \
  const Scalar* rhs, Index rhsIncr, \
  Scalar* res, Index resIncr, Scalar alpha) \
{ \
    general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
} \
}; \

EIGEN_MKL_GEMV_SPECIALIZE(double)
EIGEN_MKL_GEMV_SPECIALIZE(float)
EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
EIGEN_MKL_GEMV_SPECIALIZE(scomplex)

#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
\
static EIGEN_DONT_INLINE void run( \
  Index rows, Index cols, \
  const EIGTYPE* lhs, Index lhsStride, \
  const EIGTYPE* rhs, Index rhsIncr, \
  EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
{ \
  MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
  MKLTYPE alpha_, beta_; \
  const EIGTYPE *x_ptr, myone(1); \
  char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
  if (LhsStorageOrder==RowMajor) { \
    m=cols; \
    n=rows; \
  }\
  assign_scalar_eig2mkl(alpha_, alpha); \
  assign_scalar_eig2mkl(beta_, myone); \
  GEMVVector x_tmp; \
  if (ConjugateRhs) { \
    Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
    x_tmp=map_x.conjugate(); \
    x_ptr=x_tmp.data(); \
    incx=1; \
  } else x_ptr=rhs; \
  MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
}\
};

EIGEN_MKL_GEMV_SPECIALIZATION(double,   double,        d)
EIGEN_MKL_GEMV_SPECIALIZATION(float,    float,         s)
EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8,  c)

} // end namespase internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
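Aside (not from the imported file): the EIGEN_MKL_GEMV_SPECIALIZE dispatch above keeps the built-in kernel for the conjugated column-major case and sends everything else to ?gemv. Assuming an MKL (or compatible BLAS) installation is available and linked, defining EIGEN_USE_BLAS before including Eigen is, to my understanding, enough to activate this path; treat the build requirements as an assumption.

// Sketch: route a gemv-shaped product through the BLAS bindings in this file.
#define EIGEN_USE_BLAS      // assumes MKL headers/libraries are available at build time
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 4);
  Eigen::VectorXd x = Eigen::VectorXd::Random(4);
  Eigen::VectorXd y = Eigen::VectorXd::Zero(6);
  y.noalias() += 2.0 * A * x;   // matches the pattern handled by EIGEN_MKL_GEMV_SPECIALIZATION(double,...)
  return 0;
}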
densecrf/include/Eigen/src/Core/products/Parallelizer.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H
namespace Eigen {

namespace internal {

/** \internal */
inline void manage_multi_threading(Action action, int* v)
{
  static EIGEN_UNUSED int m_maxThreads = -1;

  if(action==SetAction)
  {
    eigen_internal_assert(v!=0);
    m_maxThreads = *v;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(v!=0);
    #ifdef EIGEN_HAS_OPENMP
    if(m_maxThreads>0)
      *v = m_maxThreads;
    else
      *v = omp_get_max_threads();
    #else
    *v = 1;
    #endif
  }
  else
  {
    eigen_internal_assert(false);
  }
}

}

/** Must be call first when calling Eigen from multiple threads */
inline void initParallel()
{
  int nbt;
  internal::manage_multi_threading(GetAction, &nbt);
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
}

/** \returns the max number of threads reserved for Eigen
  * \sa setNbThreads */
inline int nbThreads()
{
  int ret;
  internal::manage_multi_threading(GetAction, &ret);
  return ret;
}

/** Sets the max number of threads reserved for Eigen
  * \sa nbThreads */
inline void setNbThreads(int v)
{
  internal::manage_multi_threading(SetAction, &v);
}

namespace internal {

template<typename Index> struct GemmParallelInfo
{
  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}

  int volatile sync;
  int volatile users;

  Index rhs_start;
  Index rhs_length;
};

template<bool Condition, typename Functor, typename Index>
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
{
  // TODO when EIGEN_USE_BLAS is defined,
  // we should still enable OMP for other scalar types
#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
  // FIXME the transpose variable is only needed to properly split
  // the matrix product when multithreading is enabled. This is a temporary
  // fix to support row-major destination matrices. This whole
  // parallelizer mechanism has to be redisigned anyway.
  EIGEN_UNUSED_VARIABLE(transpose);
  func(0,rows, 0,cols);
#else

  // Dynamically check whether we should enable or disable OpenMP.
  // The conditions are:
  // - the max number of threads we can create is greater than 1
  // - we are not already in a parallel code
  // - the sizes are large enough

  // 1- are we already in a parallel session?
  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
  if((!Condition) || (omp_get_num_threads()>1))
    return func(0,rows, 0,cols);

  Index size = transpose ? cols : rows;

  // 2- compute the maximal number of threads from the size of the product:
  // FIXME this has to be fine tuned
  Index max_threads = std::max<Index>(1,size / 32);

  // 3 - compute the number of threads we are going to use
  Index threads = std::min<Index>(nbThreads(), max_threads);

  if(threads==1)
    return func(0,rows, 0,cols);

  Eigen::initParallel();
  func.initParallelSession();

  if(transpose)
    std::swap(rows,cols);

  Index blockCols = (cols / threads) & ~Index(0x3);
  Index blockRows = (rows / threads) & ~Index(0x7);

  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];

  #pragma omp parallel for schedule(static,1) num_threads(threads)
  for(Index i=0; i<threads; ++i)
  {
    Index r0 = i*blockRows;
    Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;

    Index c0 = i*blockCols;
    Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;

    info[i].rhs_start = c0;
    info[i].rhs_length = actualBlockCols;

    if(transpose)
      func(0, cols, r0, actualBlockRows, info);
    else
      func(r0, actualBlockRows, 0,cols, info);
  }

  delete[] info;
#endif
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PARALLELIZER_H
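Aside (not from the imported file): the user-visible pieces of this parallelizer are initParallel(), setNbThreads() and nbThreads(), all defined above; parallelize_gemm() itself is picked up by the matrix-product implementation when OpenMP is enabled at compile time (e.g. -fopenmp).

// Sketch: capping and querying the thread count used for large products.
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::initParallel();     // recommended before calling Eigen from several threads
  Eigen::setNbThreads(4);    // upper bound consulted by parallelize_gemm
  std::cout << "Eigen may use up to " << Eigen::nbThreads() << " threads\n";

  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512), B = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd C = A * B; // split across threads when OpenMP is active and the size is large enough
  return 0;
}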
densecrf/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
0 → 100644
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
namespace
Eigen
{
namespace
internal
{
// pack a selfadjoint block diagonal for use with the gebp_kernel
template
<
typename
Scalar
,
typename
Index
,
int
Pack1
,
int
Pack2
,
int
StorageOrder
>
struct
symm_pack_lhs
{
template
<
int
BlockRows
>
inline
void
pack
(
Scalar
*
blockA
,
const
const_blas_data_mapper
<
Scalar
,
Index
,
StorageOrder
>&
lhs
,
Index
cols
,
Index
i
,
Index
&
count
)
{
// normal copy
for
(
Index
k
=
0
;
k
<
i
;
k
++
)
for
(
Index
w
=
0
;
w
<
BlockRows
;
w
++
)
blockA
[
count
++
]
=
lhs
(
i
+
w
,
k
);
// normal
// symmetric copy
Index
h
=
0
;
for
(
Index
k
=
i
;
k
<
i
+
BlockRows
;
k
++
)
{
for
(
Index
w
=
0
;
w
<
h
;
w
++
)
blockA
[
count
++
]
=
conj
(
lhs
(
k
,
i
+
w
));
// transposed
blockA
[
count
++
]
=
real
(
lhs
(
k
,
k
));
// real (diagonal)
for
(
Index
w
=
h
+
1
;
w
<
BlockRows
;
w
++
)
blockA
[
count
++
]
=
lhs
(
i
+
w
,
k
);
// normal
++
h
;
}
// transposed copy
for
(
Index
k
=
i
+
BlockRows
;
k
<
cols
;
k
++
)
for
(
Index
w
=
0
;
w
<
BlockRows
;
w
++
)
blockA
[
count
++
]
=
conj
(
lhs
(
k
,
i
+
w
));
// transposed
}
void
operator
()(
Scalar
*
blockA
,
const
Scalar
*
_lhs
,
Index
lhsStride
,
Index
cols
,
Index
rows
)
{
const_blas_data_mapper
<
Scalar
,
Index
,
StorageOrder
>
lhs
(
_lhs
,
lhsStride
);
Index
count
=
0
;
Index
peeled_mc
=
(
rows
/
Pack1
)
*
Pack1
;
for
(
Index
i
=
0
;
i
<
peeled_mc
;
i
+=
Pack1
)
{
pack
<
Pack1
>
(
blockA
,
lhs
,
cols
,
i
,
count
);
}
if
(
rows
-
peeled_mc
>=
Pack2
)
{
pack
<
Pack2
>
(
blockA
,
lhs
,
cols
,
peeled_mc
,
count
);
peeled_mc
+=
Pack2
;
}
// do the same with mr==1
for
(
Index
i
=
peeled_mc
;
i
<
rows
;
i
++
)
{
for
(
Index
k
=
0
;
k
<
i
;
k
++
)
blockA
[
count
++
]
=
lhs
(
i
,
k
);
// normal
blockA
[
count
++
]
=
real
(
lhs
(
i
,
i
));
// real (diagonal)
for
(
Index
k
=
i
+
1
;
k
<
cols
;
k
++
)
blockA
[
count
++
]
=
conj
(
lhs
(
k
,
i
));
// transposed
}
}
};
template
<
typename
Scalar
,
typename
Index
,
int
nr
,
int
StorageOrder
>
struct
symm_pack_rhs
{
enum
{
PacketSize
=
packet_traits
<
Scalar
>::
size
};
void
operator
()(
Scalar
*
blockB
,
const
Scalar
*
_rhs
,
Index
rhsStride
,
Index
rows
,
Index
cols
,
Index
k2
)
{
Index
end_k
=
k2
+
rows
;
Index
count
=
0
;
const_blas_data_mapper
<
Scalar
,
Index
,
StorageOrder
>
rhs
(
_rhs
,
rhsStride
);
Index
packet_cols
=
(
cols
/
nr
)
*
nr
;
// first part: normal case
for
(
Index
j2
=
0
;
j2
<
k2
;
j2
+=
nr
)
{
for
(
Index
k
=
k2
;
k
<
end_k
;
k
++
)
{
blockB
[
count
+
0
]
=
rhs
(
k
,
j2
+
0
);
blockB
[
count
+
1
]
=
rhs
(
k
,
j2
+
1
);
if
(
nr
==
4
)
{
blockB
[
count
+
2
]
=
rhs
(
k
,
j2
+
2
);
blockB
[
count
+
3
]
=
rhs
(
k
,
j2
+
3
);
}
count
+=
nr
;
}
}
// second part: diagonal block
for
(
Index
j2
=
k2
;
j2
<
(
std
::
min
)(
k2
+
rows
,
packet_cols
);
j2
+=
nr
)
{
// again we can split vertically in three different parts (transpose, symmetric, normal)
// transpose
for
(
Index
k
=
k2
;
k
<
j2
;
k
++
)
{
blockB
[
count
+
0
]
=
conj
(
rhs
(
j2
+
0
,
k
));
blockB
[
count
+
1
]
=
conj
(
rhs
(
j2
+
1
,
k
));
if
(
nr
==
4
)
{
blockB
[
count
+
2
]
=
conj
(
rhs
(
j2
+
2
,
k
));
blockB
[
count
+
3
]
=
conj
(
rhs
(
j2
+
3
,
k
));
}
count
+=
nr
;
}
// symmetric
Index
h
=
0
;
for
(
Index
k
=
j2
;
k
<
j2
+
nr
;
k
++
)
{
// normal
for
(
Index
w
=
0
;
w
<
h
;
++
w
)
blockB
[
count
+
w
]
=
rhs
(
k
,
j2
+
w
);
blockB
[
count
+
h
]
=
real
(
rhs
(
k
,
k
));
// transpose
for
(
Index
w
=
h
+
1
;
w
<
nr
;
++
w
)
blockB
[
count
+
w
]
=
conj
(
rhs
(
j2
+
w
,
k
));
count
+=
nr
;
++
h
;
}
// normal
for
(
Index
k
=
j2
+
nr
;
k
<
end_k
;
k
++
)
{
blockB
[
count
+
0
]
=
rhs
(
k
,
j2
+
0
);
blockB
[
count
+
1
]
=
rhs
(
k
,
j2
+
1
);
if
(
nr
==
4
)
{
blockB
[
count
+
2
]
=
rhs
(
k
,
j2
+
2
);
blockB
[
count
+
3
]
=
rhs
(
k
,
j2
+
3
);
}
count
+=
nr
;
}
}
// third part: transposed
for
(
Index
j2
=
k2
+
rows
;
j2
<
packet_cols
;
j2
+=
nr
)
{
for
(
Index
k
=
k2
;
k
<
end_k
;
k
++
)
{
blockB
[
count
+
0
]
=
conj
(
rhs
(
j2
+
0
,
k
));
blockB
[
count
+
1
]
=
conj
(
rhs
(
j2
+
1
,
k
));
if
(
nr
==
4
)
{
blockB
[
count
+
2
]
=
conj
(
rhs
(
j2
+
2
,
k
));
blockB
[
count
+
3
]
=
conj
(
rhs
(
j2
+
3
,
k
));
}
count
+=
nr
;
}
}
// copy the remaining columns one at a time (=> the same with nr==1)
for
(
Index
j2
=
packet_cols
;
j2
<
cols
;
++
j2
)
{
// transpose
Index
half
=
(
std
::
min
)(
end_k
,
j2
);
for
(
Index
k
=
k2
;
k
<
half
;
k
++
)
{
blockB
[
count
]
=
conj
(
rhs
(
j2
,
k
));
count
+=
1
;
}
if
(
half
==
j2
&&
half
<
k2
+
rows
)
{
blockB
[
count
]
=
real
(
rhs
(
j2
,
j2
));
count
+=
1
;
}
else
half
--
;
// normal
for
(
Index
k
=
half
+
1
;
k
<
k2
+
rows
;
k
++
)
{
blockB
[
count
]
=
rhs
(
k
,
j2
);
count
+=
1
;
}
}
}
};
/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
* the general matrix matrix product.
*/
template
<
typename
Scalar
,
typename
Index
,
int
LhsStorageOrder
,
bool
LhsSelfAdjoint
,
bool
ConjugateLhs
,
int
RhsStorageOrder
,
bool
RhsSelfAdjoint, bool ConjugateRhs,
          int ResStorageOrder>
struct product_selfadjoint_matrix;

template <typename Scalar, typename Index,
          int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
{
  static EIGEN_STRONG_INLINE void run(
    Index rows, Index cols,
    const Scalar* lhs, Index lhsStride,
    const Scalar* rhs, Index rhsStride,
    Scalar* res,       Index resStride,
    Scalar alpha)
  {
    product_selfadjoint_matrix<Scalar, Index,
      EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
      RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
      EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
      LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
      ColMajor>
      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha);
  }
};

template <typename Scalar, typename Index,
          int LhsStorageOrder, bool ConjugateLhs,
          int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
{
  static EIGEN_DONT_INLINE void run(
    Index rows, Index cols,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
    Scalar* res,        Index resStride,
    Scalar alpha)
  {
    Index size = rows;

    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

    typedef gebp_traits<Scalar,Scalar> Traits;

    Index kc = size;  // cache block size along the K direction
    Index mc = rows;  // cache block size along the M direction
    Index nc = cols;  // cache block size along the N direction
    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
    // kc must be smaller than mc
    kc = (std::min)(kc,mc);

    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
    std::size_t sizeB = sizeW + kc*cols;
    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
    Scalar* blockB = allocatedBlockB + sizeW;

    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
    symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
    gemm_pack_rhs<Scalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;

    for(Index k2=0; k2<size; k2+=kc)
    {
      const Index actual_kc = (std::min)(k2+kc,size)-k2;

      // we have selected one row panel of rhs and one column panel of lhs
      // pack rhs's panel into a sequential chunk of memory
      // and expand each coeff to a constant packet for further reuse
      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);

      // the selected lhs panel has to be split in three different parts:
      //  1 - the transposed panel above the diagonal block => transposed packed copy
      //  2 - the diagonal block => special packed copy
      //  3 - the panel below the diagonal block => generic packed copy
      for(Index i2=0; i2<k2; i2+=mc)
      {
        const Index actual_mc = (std::min)(i2+mc,k2)-i2;
        // transposed packed copy
        pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);

        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      }
      // the block diagonal
      {
        const Index actual_mc = (std::min)(k2+kc,size)-k2;
        // symmetric packed copy
        pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);

        gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      }

      for(Index i2=k2+kc; i2<size; i2+=mc)
      {
        const Index actual_mc = (std::min)(i2+mc,size)-i2;
        gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder, false>()
          (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);

        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      }
    }
  }
};
// matrix * selfadjoint product
template <typename Scalar, typename Index,
          int LhsStorageOrder, bool ConjugateLhs,
          int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
{
  static EIGEN_DONT_INLINE void run(
    Index rows, Index cols,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
    Scalar* res,        Index resStride,
    Scalar alpha)
  {
    Index size = cols;

    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);

    typedef gebp_traits<Scalar,Scalar> Traits;

    Index kc = size;  // cache block size along the K direction
    Index mc = rows;  // cache block size along the M direction
    Index nc = cols;  // cache block size along the N direction
    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
    std::size_t sizeB = sizeW + kc*cols;
    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
    Scalar* blockB = allocatedBlockB + sizeW;

    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
    symm_pack_rhs<Scalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;

    for(Index k2=0; k2<size; k2+=kc)
    {
      const Index actual_kc = (std::min)(k2+kc,size)-k2;

      pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);

      // => GEPP
      for(Index i2=0; i2<rows; i2+=mc)
      {
        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);

        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
      }
    }
  }
};

} // end namespace internal

/***************************************************************************
* Wrapper to product_selfadjoint_matrix
***************************************************************************/

namespace internal {
template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}

template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  enum {
    LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
    LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
    RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
    RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
  };

  template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const
  {
    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    internal::product_selfadjoint_matrix<Scalar, Index,
      EIGEN_LOGICAL_XOR(LhsIsUpper, internal::traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper, bool(LhsBlasTraits::NeedToConjugate)),
      EIGEN_LOGICAL_XOR(RhsIsUpper, internal::traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper, bool(RhsBlasTraits::NeedToConjugate)),
      internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
      ::run(
        lhs.rows(), rhs.cols(),                 // sizes
        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
        &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
        &dst.coeffRef(0,0), dst.outerStride(),  // result info
        actualAlpha                             // alpha
      );
  }
};

} // end namespace Eigen

#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
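For reference (not part of the committed file): the blocked kernels above are normally reached through Eigen's public selfadjointView() API rather than called directly. The following minimal sketch, assuming only that Eigen is on the include path, exercises the selfadjoint matrix * matrix path and checks it against a plain product; everything it uses is standard Eigen 3 API, and the scale factor mentioned in the comment refers to the alpha argument of run() above.

// Hypothetical usage sketch, not part of this commit.
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd S = A + A.transpose();            // symmetric lhs
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 3);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(4, 3);

  // Only the lower triangle of S is referenced; the product is evaluated by
  // the selfadjoint kernels above (any scale factor folded into the
  // expression is forwarded through the alpha argument of run()).
  C.noalias() += S.selfadjointView<Eigen::Lower>() * B;

  std::cout << (C - S * B).norm() << "\n";  // ~0
  return 0;
}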
densecrf/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
0 → 100644
View file @
da2c1226
/*
Copyright (c) 2011, Intel Corporation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Eigen bindings to Intel(R) MKL
* Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
********************************************************************************
*/
#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
namespace Eigen {

namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
{\
\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='L', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
EIGTYPE myone(1);\
\
/* Set transpose options */ \
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)lhsStride; \
ldb = (MKL_INT)rhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (LhsStorageOrder==RowMajor) uplo='U'; \
a = _lhs; \
\
if (RhsStorageOrder==RowMajor) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = rhs.adjoint(); \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} else b = _rhs; \
\
MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
{\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='L', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
EIGTYPE myone(1); \
\
/* Set transpose options */ \
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)lhsStride; \
ldb = (MKL_INT)rhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
a_tmp = lhs.conjugate(); \
a = a_tmp.data(); \
lda = a_tmp.outerStride(); \
} else a = _lhs; \
if (LhsStorageOrder==RowMajor) uplo='U'; \
\
if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
b = _rhs; } \
else { \
if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
b_tmp = rhs.conjugate(); \
} else \
if (ConjugateRhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = rhs.adjoint(); \
} else { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = rhs.transpose(); \
} \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} \
\
MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
\
} \
};
EIGEN_MKL_SYMM_L(double, double, d, d)
EIGEN_MKL_SYMM_L(float, float, f, s)
EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
{\
\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='R', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
EIGTYPE myone(1);\
\
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)rhsStride; \
ldb = (MKL_INT)lhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (RhsStorageOrder==RowMajor) uplo='U'; \
a = _rhs; \
\
if (LhsStorageOrder==RowMajor) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = lhs.adjoint(); \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} else b = _lhs; \
\
MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
{\
static EIGEN_DONT_INLINE void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE alpha) \
{ \
char side='R', uplo='L'; \
MKL_INT m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
MKLTYPE alpha_, beta_; \
MatrixX##EIGPREFIX b_tmp; \
Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
EIGTYPE myone(1); \
\
/* Set m, n, k */ \
m = (MKL_INT)rows; \
n = (MKL_INT)cols; \
\
/* Set alpha_ & beta_ */ \
assign_scalar_eig2mkl(alpha_, alpha); \
assign_scalar_eig2mkl(beta_, myone); \
\
/* Set lda, ldb, ldc */ \
lda = (MKL_INT)rhsStride; \
ldb = (MKL_INT)lhsStride; \
ldc = (MKL_INT)resStride; \
\
/* Set a, b, c */ \
if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
a_tmp = rhs.conjugate(); \
a = a_tmp.data(); \
lda = a_tmp.outerStride(); \
} else a = _rhs; \
if (RhsStorageOrder==RowMajor) uplo='U'; \
\
if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
b = _lhs; } \
else { \
if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
b_tmp = lhs.conjugate(); \
} else \
if (ConjugateLhs) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
b_tmp = lhs.adjoint(); \
} else { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
b_tmp = lhs.transpose(); \
} \
b = b_tmp.data(); \
ldb = b_tmp.outerStride(); \
} \
\
MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
} \
};
EIGEN_MKL_SYMM_R(double, double, d, d)
EIGEN_MKL_SYMM_R(float, float, f, s)
EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
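For reference (not part of the committed file): these ?SYMM/?HEMM specializations only take effect in builds where Eigen's MKL backend is switched on, which in this Eigen version is done by defining EIGEN_USE_MKL_ALL before including Eigen and linking against MKL. A minimal build sketch, assuming an MKL installation is available and linked:

// Hypothetical build sketch, not part of this commit; it will not compile
// without an MKL installation.
#define EIGEN_USE_MKL_ALL  // route suitable products (incl. ?SYMM/?HEMM) to MKL
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd S = A + A.transpose();              // symmetric lhs
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 16);
  // With MKL enabled this selfadjoint product is forwarded to dsymm through
  // the EIGEN_MKL_SYMM_L expansion above; without it, the generic blocked
  // kernel from SelfadjointMatrixMatrix.h is used instead.
  Eigen::MatrixXd C = S.selfadjointView<Eigen::Lower>() * B;
  return C.size() == 64 * 16 ? 0 : 1;
}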
densecrf/include/Eigen/src/Core/products/SelfadjointMatrixVector.h
0 → 100644
View file @
da2c1226
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
namespace Eigen {

namespace internal {
/* Optimized selfadjoint matrix * vector product:
* This algorithm processes 2 columns at once, which both reduces the
* number of loads/stores of the result by a factor of 2 and reduces
* the instruction dependency.
*/
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
struct selfadjoint_matrix_vector_product;

template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
struct selfadjoint_matrix_vector_product
{
static EIGEN_DONT_INLINE void run(
  Index size,
  const Scalar*  lhs, Index lhsStride,
  const Scalar* _rhs, Index rhsIncr,
  Scalar* res,
  Scalar alpha)
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename NumTraits<Scalar>::Real RealScalar;
  const Index PacketSize = sizeof(Packet)/sizeof(Scalar);

  enum {
    IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
    IsLower = UpLo == Lower ? 1 : 0,
    FirstTriangular = IsRowMajor == IsLower
  };

  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;

  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;

  Scalar cjAlpha = ConjugateRhs ? conj(alpha) : alpha;

  // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
  // this is because we need to extract packets
  ei_declare_aligned_stack_constructed_variable(Scalar, rhs, size, rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);
  if (rhsIncr!=1)
  {
    const Scalar* it = _rhs;
    for (Index i=0; i<size; ++i, it+=rhsIncr)
      rhs[i] = *it;
  }

  Index bound = (std::max)(Index(0), size-8) & 0xfffffffe;
  if (FirstTriangular)
    bound = size - bound;

  for (Index j=FirstTriangular ? bound : 0;
       j<(FirstTriangular ? size : bound); j+=2)
  {
    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
    register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;

    Scalar t0 = cjAlpha * rhs[j];
    Packet ptmp0 = pset1<Packet>(t0);
    Scalar t1 = cjAlpha * rhs[j+1];
    Packet ptmp1 = pset1<Packet>(t1);

    Scalar t2(0);
    Packet ptmp2 = pset1<Packet>(t2);
    Scalar t3(0);
    Packet ptmp3 = pset1<Packet>(t3);

    size_t starti = FirstTriangular ? 0 : j+2;
    size_t endi   = FirstTriangular ? j : size;
    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);

    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
    res[j]   += cjd.pmul(internal::real(A0[j]),   t0);
    res[j+1] += cjd.pmul(internal::real(A1[j+1]), t1);
    if(FirstTriangular)
    {
      res[j] += cj0.pmul(A1[j], t1);
      t3     += cj1.pmul(A1[j], rhs[j]);
    }
    else
    {
      res[j+1] += cj0.pmul(A0[j+1], t0);
      t2       += cj1.pmul(A0[j+1], rhs[j+1]);
    }

    for (size_t i=starti; i<alignedStart; ++i)
    {
      res[i] += t0 * A0[i] + t1 * A1[i];
      t2 += conj(A0[i]) * rhs[i];
      t3 += conj(A1[i]) * rhs[i];
    }
    // Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed up)
    // gcc 4.2 does this optimization automatically.
    const Scalar* EIGEN_RESTRICT a0It  = A0  + alignedStart;
    const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
    const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
    Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
    {
      Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
      Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
      Packet Bi  = ploadu<Packet>(rhsIt); rhsIt += PacketSize;  // FIXME should be aligned in most cases
      Packet Xi  = pload<Packet>(resIt);

      Xi    = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
      ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
      ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
      pstore(resIt, Xi); resIt += PacketSize;
    }
    for (size_t i=alignedEnd; i<endi; i++)
    {
      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
      t2 += cj1.pmul(A0[i], rhs[i]);
      t3 += cj1.pmul(A1[i], rhs[i]);
    }

    res[j]   += alpha * (t2 + predux(ptmp2));
    res[j+1] += alpha * (t3 + predux(ptmp3));
  }
  for (Index j=FirstTriangular ? 0 : bound; j<(FirstTriangular ? bound : size); j++)
  {
    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;

    Scalar t1 = cjAlpha * rhs[j];
    Scalar t2(0);
    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
    res[j] += cjd.pmul(internal::real(A0[j]), t1);
    for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
    {
      res[i] += cj0.pmul(A0[i], t1);
      t2 += cj1.pmul(A0[i], rhs[i]);
    }
    res[j] += alpha * t2;
  }
}
};

} // end namespace internal

/***************************************************************************
* Wrapper to product_selfadjoint_vector
***************************************************************************/

namespace internal {
template<typename Lhs, int LhsMode, typename Rhs>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
{};
}

template<typename Lhs, int LhsMode, typename Rhs>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  enum {
    LhsUpLo = LhsMode&(Upper|Lower)
  };

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const
  {
    typedef typename Dest::Scalar ResScalar;
    typedef typename Base::RhsScalar RhsScalar;
    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;

    eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());

    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    enum {
      EvalToDest = (Dest::InnerStrideAtCompileTime==1),
      UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
    };

    internal::gemv_static_vector_if<ResScalar, Dest::SizeAtCompileTime, Dest::MaxSizeAtCompileTime, !EvalToDest> static_dest;
    internal::gemv_static_vector_if<RhsScalar, _ActualRhsType::SizeAtCompileTime, _ActualRhsType::MaxSizeAtCompileTime, !UseRhs> static_rhs;

    ei_declare_aligned_stack_constructed_variable(ResScalar, actualDestPtr, dest.size(),
                                                  EvalToDest ? dest.data() : static_dest.data());

    ei_declare_aligned_stack_constructed_variable(RhsScalar, actualRhsPtr, rhs.size(),
                                                  UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());

    if(!EvalToDest)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      int size = dest.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      MappedDest(actualDestPtr, dest.size()) = dest;
    }

    if(!UseRhs)
    {
      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      int size = rhs.size();
      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
      #endif
      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
    }

    internal::selfadjoint_matrix_vector_product<Scalar, Index,
      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
      int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
      (
        lhs.rows(),                             // size
        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
        actualRhsPtr, 1,                        // rhs info
        actualDestPtr,                          // result info
        actualAlpha                             // scale factor
      );

    if(!EvalToDest)
      dest = MappedDest(actualDestPtr, dest.size());
  }
};

namespace internal {
template<typename Lhs, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
  : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}

template<typename Lhs, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
  : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)

  enum {
    RhsUpLo = RhsMode&(Upper|Lower)
  };

  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const
  {
    // let's simply transpose the product
    Transpose<Dest> destT(dest);
    SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
                             Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
  }
};

} // end namespace Eigen

#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
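For reference (not part of the committed file): a minimal sketch, assuming only that Eigen is on the include path, of the user-level call that reaches selfadjoint_matrix_vector_product above (a ?SYMV-style BLAS level-2 operation); it uses only standard Eigen 3 API and checks the result against a plain product.

// Hypothetical usage sketch, not part of this commit.
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 5);
  Eigen::MatrixXd S = A + A.transpose();          // symmetric matrix
  Eigen::VectorXd x = Eigen::VectorXd::Random(5);
  Eigen::VectorXd y = Eigen::VectorXd::Zero(5);

  // Only the upper triangle of S is read; this matrix * vector product is
  // evaluated by the selfadjoint_matrix_vector_product kernel above.
  y.noalias() += S.selfadjointView<Eigen::Upper>() * x;

  std::cout << (y - S * x).norm() << "\n";  // ~0
  return 0;
}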