Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
pydensecrf
Commits
13b115ab
Commit
13b115ab
authored
Mar 08, 2021
by
Simon Tulling
Browse files
Fix Windows MSVC install by updating Eigen Library
parent
4d5343c3
Changes
157
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
7972 additions
and
342 deletions
+7972
-342
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/Complex.h
...nsecrf/densecrf/include/Eigen/src/Core/arch/AVX/Complex.h
+451
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/MathFunctions.h
.../densecrf/include/Eigen/src/Core/arch/AVX/MathFunctions.h
+439
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/PacketMath.h
...crf/densecrf/include/Eigen/src/Core/arch/AVX/PacketMath.h
+637
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/TypeCasting.h
...rf/densecrf/include/Eigen/src/Core/arch/AVX/TypeCasting.h
+51
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX512/MathFunctions.h
...nsecrf/include/Eigen/src/Core/arch/AVX512/MathFunctions.h
+389
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX512/PacketMath.h
.../densecrf/include/Eigen/src/Core/arch/AVX512/PacketMath.h
+1305
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/AltiVec/Complex.h
...rf/densecrf/include/Eigen/src/Core/arch/AltiVec/Complex.h
+268
-55
pydensecrf/densecrf/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h
...secrf/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+322
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
...densecrf/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
+694
-131
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/Complex.h
...secrf/densecrf/include/Eigen/src/Core/arch/CUDA/Complex.h
+103
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/Half.h
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/Half.h
+675
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/MathFunctions.h
...densecrf/include/Eigen/src/Core/arch/CUDA/MathFunctions.h
+91
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/PacketMath.h
...rf/densecrf/include/Eigen/src/Core/arch/CUDA/PacketMath.h
+333
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
...ensecrf/include/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+1124
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/TypeCasting.h
...f/densecrf/include/Eigen/src/Core/arch/CUDA/TypeCasting.h
+212
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/Default/ConjHelper.h
...densecrf/include/Eigen/src/Core/arch/Default/ConjHelper.h
+29
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/NEON/Complex.h
...secrf/densecrf/include/Eigen/src/Core/arch/NEON/Complex.h
+251
-20
pydensecrf/densecrf/include/Eigen/src/Core/arch/NEON/MathFunctions.h
...densecrf/include/Eigen/src/Core/arch/NEON/MathFunctions.h
+91
-0
pydensecrf/densecrf/include/Eigen/src/Core/arch/NEON/PacketMath.h
...rf/densecrf/include/Eigen/src/Core/arch/NEON/PacketMath.h
+415
-79
pydensecrf/densecrf/include/Eigen/src/Core/arch/SSE/Complex.h
...nsecrf/densecrf/include/Eigen/src/Core/arch/SSE/Complex.h
+92
-57
No files found.
Too many changes to show.
To preserve performance only
157 of 157+
files are displayed.
Plain diff
Email patch
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/Complex.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_AVX_H
#define EIGEN_COMPLEX_AVX_H
namespace
Eigen
{
namespace
internal
{
//---------- float ----------
// Packet type wrapping one __m256 register; this file uses it as the
// 4-element std::complex<float> packet.
struct Packet4cf
{
  // Raw AVX register holding the packet data.
  __m256 v;

  // Default construction leaves v uninitialized.
  EIGEN_STRONG_INLINE Packet4cf() {}

  // Wraps an existing register value. Explicit, so a bare __m256 is never
  // converted to a complex packet implicitly.
  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
};
// Packet traits for std::complex<float>: the full packet is Packet4cf and the
// half packet is Packet2cf.
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
  typedef Packet4cf type;
  typedef Packet2cf half;

  enum {
    // Packet shape and alignment flags.
    Vectorizable    = 1,
    AlignedOnScalar = 1,
    size            = 4,
    HasHalfPacket   = 1,

    // Operations flagged as available.
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,

    // Operations flagged as unavailable.
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasSetLinear = 0
  };
};
// Reverse mapping for Packet4cf: scalar type, half-packet type, element count
// and required alignment.
template<> struct unpacket_traits<Packet4cf>
{
  typedef std::complex<float> type;
  typedef Packet2cf half;
  enum {
    size      = 4,
    alignment = Aligned32
  };
};
// Element-wise sum of two complex<float> packets (plain float add on the
// underlying register).
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  const __m256 sum = _mm256_add_ps(a.v, b.v);
  return Packet4cf(sum);
}
// Element-wise difference a - b of two complex<float> packets.
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  const __m256 difference = _mm256_sub_ps(a.v, b.v);
  return Packet4cf(difference);
}
// Negates a complex<float> packet by delegating to pnegate on the raw
// float register.
template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
{
  const __m256 negated = pnegate(a.v);
  return Packet4cf(negated);
}
// Complex conjugate: XORs the sign bit of every odd-indexed float, i.e. the
// imaginary parts in the interleaved (re, im) layout used by this file.
template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
{
  const __m256 imag_sign_mask = _mm256_castsi256_ps(
      _mm256_setr_epi32(0x00000000, 0x80000000,
                        0x00000000, 0x80000000,
                        0x00000000, 0x80000000,
                        0x00000000, 0x80000000));
  const __m256 conjugated = _mm256_xor_ps(a.v, imag_sign_mask);
  return Packet4cf(conjugated);
}
// Complex product of two complex<float> packets.
template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  // Even-indexed floats of a duplicated into both slots of each pair.
  const __m256 a_even_dup = _mm256_moveldup_ps(a.v);
  // Odd-indexed floats of a duplicated into both slots of each pair.
  const __m256 a_odd_dup = _mm256_movehdup_ps(a.v);
  // b with the two floats of every pair swapped.
  const __m256 b_swapped = _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1));

  const __m256 straight = _mm256_mul_ps(a_even_dup, b.v);
  const __m256 crossed  = _mm256_mul_ps(a_odd_dup, b_swapped);

  // addsub subtracts in even slots and adds in odd slots.
  return Packet4cf(_mm256_addsub_ps(straight, crossed));
}
// Bitwise AND of the raw register contents.
template<> EIGEN_STRONG_INLINE Packet4cf pand<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  const __m256 bits = _mm256_and_ps(a.v, b.v);
  return Packet4cf(bits);
}
// Bitwise OR of the raw register contents.
template<> EIGEN_STRONG_INLINE Packet4cf por<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  const __m256 bits = _mm256_or_ps(a.v, b.v);
  return Packet4cf(bits);
}
// Bitwise XOR of the raw register contents.
template<> EIGEN_STRONG_INLINE Packet4cf pxor<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  const __m256 bits = _mm256_xor_ps(a.v, b.v);
  return Packet4cf(bits);
}
// Bitwise and-not of the raw register contents via _mm256_andnot_ps(a.v, b.v).
// NOTE(review): with this operand order the intrinsic yields (~a) & b; confirm
// that this matches the generic pandnot contract expected by callers.
template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  const __m256 bits = _mm256_andnot_ps(a.v, b.v);
  return Packet4cf(bits);
}
// Aligned load of four complex<float> values: forwards to the Packet8f aligned
// load, starting at the real part of *from.
template<> EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const Packet8f raw = pload<Packet8f>(&numext::real_ref(*from));
  return Packet4cf(raw);
}
// Unaligned load of four complex<float> values: forwards to the Packet8f
// unaligned load, starting at the real part of *from.
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const Packet8f raw = ploadu<Packet8f>(&numext::real_ref(*from));
  return Packet4cf(raw);
}
// Broadcasts one complex<float> to all four slots. The 8-byte (re, im) pair is
// read through a double pointer so a single 64-bit broadcast replicates it.
template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
{
  const double* as_double = static_cast<const double*>(static_cast<const void*>(&from));
  const __m256d replicated = _mm256_broadcast_sd(as_double);
  return Packet4cf(_mm256_castpd_ps(replicated));
}
// Builds a packet from two Packet2cf ploaddup results: the one for from[0]
// fills the lower 128 bits, the one for from[1] the upper 128 bits.
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
{
  // FIXME The following might be optimized using _mm256_movedup_pd
  const Packet2cf lower = ploaddup<Packet2cf>(from);
  const Packet2cf upper = ploaddup<Packet2cf>(from+1);
  const __m256 widened = _mm256_castps128_ps256(lower.v);
  return Packet4cf(_mm256_insertf128_ps(widened, upper.v, 1));
}
// Aligned store of a complex<float> packet: forwards the raw register to the
// float pstore, writing from the real part of *to onward.
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  pstore(&numext::real_ref(*to), from.v);
}
// Unaligned store of a complex<float> packet: forwards the raw register to the
// float pstoreu, writing from the real part of *to onward.
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  pstoreu(&numext::real_ref(*to), from.v);
}
// Strided gather: reads from[0], from[stride], from[2*stride], from[3*stride]
// and packs them as interleaved (re, im) pairs, element 0 in the lowest slots.
template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
{
  const std::complex<float>& c0 = from[0*stride];
  const std::complex<float>& c1 = from[1*stride];
  const std::complex<float>& c2 = from[2*stride];
  const std::complex<float>& c3 = from[3*stride];
  // _mm256_set_ps takes its arguments from the highest slot down to the lowest.
  return Packet4cf(_mm256_set_ps(std::imag(c3), std::real(c3),
                                 std::imag(c2), std::real(c2),
                                 std::imag(c1), std::real(c1),
                                 std::imag(c0), std::real(c0)));
}
// Strided scatter: writes the four complex values of the packet to to[0],
// to[stride], to[2*stride] and to[3*stride].
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
{
  // Lower 128 bits hold complex elements 0 and 1.
  const __m128 lower = _mm256_extractf128_ps(from.v, 0);
  const float re0 = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 0));
  const float im0 = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 1));
  const float re1 = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 2));
  const float im1 = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 3));
  to[stride*0] = std::complex<float>(re0, im0);
  to[stride*1] = std::complex<float>(re1, im1);

  // Upper 128 bits hold complex elements 2 and 3.
  const __m128 upper = _mm256_extractf128_ps(from.v, 1);
  const float re2 = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 0));
  const float im2 = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 1));
  const float re3 = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 2));
  const float im3 = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 3));
  to[stride*2] = std::complex<float>(re2, im2);
  to[stride*3] = std::complex<float>(re3, im3);
}
// Returns the first complex element by taking the lower 128 bits as a
// Packet2cf and delegating to its pfirst.
template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet4cf>(const Packet4cf& a)
{
  const Packet2cf lower_half(_mm256_castps256_ps128(a.v));
  return pfirst(lower_half);
}
// Reverses the order of the four complex elements: each 128-bit half has its
// two 64-bit (re, im) pairs swapped, then the halves trade places.
template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
  const __m128d lower_pairs = _mm_castps_pd(_mm256_extractf128_ps(a.v, 0));
  const __m128d upper_pairs = _mm_castps_pd(_mm256_extractf128_ps(a.v, 1));

  // Immediate 0x1 swaps the two 64-bit elements within a 128-bit register.
  const __m128 lower_swapped = _mm_castpd_ps(_mm_shuffle_pd(lower_pairs, lower_pairs, 0x1));
  const __m128 upper_swapped = _mm_castpd_ps(_mm_shuffle_pd(upper_pairs, upper_pairs, 0x1));

  // The old upper half becomes the new lower half and vice versa.
  const __m256 reversed = _mm256_insertf128_ps(_mm256_castps128_ps256(upper_swapped), lower_swapped, 1);
  return Packet4cf(reversed);
}
// Sum of the four complex elements: adds the two 128-bit halves as Packet2cf
// values and reduces the result with the Packet2cf predux.
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
{
  const Packet2cf lower(_mm256_extractf128_ps(a.v, 0));
  const Packet2cf upper(_mm256_extractf128_ps(a.v, 1));
  return predux(padd(lower, upper));
}
// Reduces the four packets vecs[0..3] into a single packet using in-lane
// shuffles, horizontal adds and a final cross-lane add.
template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
{
  // Reorder each 128-bit lane as (f0, f2, f1, f3) so that hadd pairs the two
  // real parts and the two imaginary parts of the lane.
  const Packet8f s0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2, 0));
  const Packet8f s1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2, 0));
  const Packet8f pair01 = _mm256_hadd_ps(s0, s1);

  const Packet8f s2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2, 0));
  const Packet8f s3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2, 0));
  const Packet8f pair23 = _mm256_hadd_ps(s2, s3);

  // 0x20 selects the low lanes of both inputs, 0x31 the high lanes.
  const Packet8f low_lanes  = _mm256_permute2f128_ps(pair01, pair23, 0x20);
  const Packet8f high_lanes = _mm256_permute2f128_ps(pair01, pair23, 0x31);

  return Packet4cf(_mm256_add_ps(low_lanes, high_lanes));
}
// Product of the four complex elements: multiplies the two 128-bit halves as
// Packet2cf values and reduces the result with the Packet2cf predux_mul.
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
{
  const Packet2cf lower(_mm256_extractf128_ps(a.v, 0));
  const Packet2cf upper(_mm256_extractf128_ps(a.v, 1));
  return predux_mul(pmul(lower, upper));
}
// palign for complex<float> packets: forwards to the Packet8f implementation
// with the offset doubled (two floats per complex element).
template<int Offset>
struct palign_impl<Offset,Packet4cf>
{
  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
  {
    // Offset 0 leaves 'first' untouched.
    if (Offset != 0)
    {
      palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
    }
  }
};
// conj_helper<Packet4cf,Packet4cf,false,true>: pmul computes a * conj(b).
template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
{
  // x * conj(y) + c, using this helper's own pmul.
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
  {
    return padd(this->pmul(x, y), c);
  }

  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
  {
    const Packet4cf b_conj = pconj(b);
    return internal::pmul(a, b_conj);
  }
};
// conj_helper<Packet4cf,Packet4cf,true,false>: pmul computes conj(a) * b.
template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
{
  // conj(x) * y + c, using this helper's own pmul.
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
  {
    return padd(this->pmul(x, y), c);
  }

  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
  {
    const Packet4cf a_conj = pconj(a);
    return internal::pmul(a_conj, b);
  }
};
// conj_helper<Packet4cf,Packet4cf,true,true>: pmul computes conj(a * b).
template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
{
  // conj(x * y) + c, using this helper's own pmul.
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
  {
    return padd(this->pmul(x, y), c);
  }

  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
  {
    const Packet4cf product = internal::pmul(a, b);
    return pconj(product);
  }
};

// NOTE(review): macro defined elsewhere; presumably generates the mixed
// complex/real conj_helper specializations for these packet types - confirm.
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
// Complex division a / b computed as (a * conj(b)) / |b|^2.
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
  const Packet4cf numerator = pmul(a, pconj(b));

  // Squares of every float of b, then the same with the floats of each pair
  // swapped; their sum puts re^2 + im^2 in both slots of each pair.
  const __m256 squares = _mm256_mul_ps(b.v, b.v);
  const __m256 squares_swapped = _mm256_shuffle_ps(squares, squares, _MM_SHUFFLE(2,3,0,1));
  const __m256 norm2 = _mm256_add_ps(squares, squares_swapped);

  return Packet4cf(_mm256_div_ps(numerator.v, norm2));
}
// Swaps the two floats of every (re, im) pair.
template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
{
  const __m256 flipped = _mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1));
  return Packet4cf(flipped);
}
//---------- double ----------
// Packet type wrapping one __m256d register; this file uses it as the
// 2-element std::complex<double> packet.
struct Packet2cd
{
  // Raw AVX register holding the packet data.
  __m256d v;

  // Default construction leaves v uninitialized.
  EIGEN_STRONG_INLINE Packet2cd() {}

  // Wraps an existing register value. Explicit, so a bare __m256d is never
  // converted to a complex packet implicitly.
  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
};
// Packet traits for std::complex<double>: the full packet is Packet2cd and the
// half packet is Packet1cd.
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
  typedef Packet2cd type;
  typedef Packet1cd half;

  enum {
    // Packet shape and alignment flags.
    Vectorizable    = 1,
    AlignedOnScalar = 0,
    size            = 2,
    HasHalfPacket   = 1,

    // Operations flagged as available.
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasNegate = 1,

    // Operations flagged as unavailable.
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasSetLinear = 0
  };
};
// Reverse mapping for Packet2cd: scalar type, half-packet type, element count
// and required alignment.
template<> struct unpacket_traits<Packet2cd>
{
  typedef std::complex<double> type;
  typedef Packet1cd half;
  enum {
    size      = 2,
    alignment = Aligned32
  };
};
// Element-wise sum of two complex<double> packets (plain double add on the
// underlying register).
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  const __m256d sum = _mm256_add_pd(a.v, b.v);
  return Packet2cd(sum);
}
// Element-wise difference a - b of two complex<double> packets.
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  const __m256d difference = _mm256_sub_pd(a.v, b.v);
  return Packet2cd(difference);
}
// Negates a complex<double> packet by delegating to pnegate on the raw
// double register.
template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a)
{
  const __m256d negated = pnegate(a.v);
  return Packet2cd(negated);
}
// Complex conjugate: XORs the sign bit of the odd-indexed doubles, i.e. the
// imaginary parts in the interleaved (re, im) layout used by this file.
template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
{
  // _mm256_set_epi32 lists 32-bit words from highest to lowest; 0x80000000
  // sits in the top word of doubles 3 and 1.
  const __m256d imag_sign_mask = _mm256_castsi256_pd(
      _mm256_set_epi32(0x80000000, 0x0, 0x0, 0x0,
                       0x80000000, 0x0, 0x0, 0x0));
  const __m256d conjugated = _mm256_xor_pd(a.v, imag_sign_mask);
  return Packet2cd(conjugated);
}
// Complex product of two complex<double> packets.
template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  // Even-indexed doubles of a duplicated into both slots of each pair.
  const __m256d a_even_dup = _mm256_shuffle_pd(a.v, a.v, 0x0);
  // Odd-indexed doubles of a duplicated into both slots of each pair.
  const __m256d a_odd_dup = _mm256_shuffle_pd(a.v, a.v, 0xF);
  // b with the two doubles of every pair swapped.
  const __m256d b_swapped = _mm256_shuffle_pd(b.v, b.v, 0x5);

  const __m256d straight = _mm256_mul_pd(a_even_dup, b.v);
  const __m256d crossed  = _mm256_mul_pd(a_odd_dup, b_swapped);

  // addsub subtracts in even slots and adds in odd slots.
  return Packet2cd(_mm256_addsub_pd(straight, crossed));
}
// Bitwise AND of the raw register contents.
template<> EIGEN_STRONG_INLINE Packet2cd pand<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  const __m256d bits = _mm256_and_pd(a.v, b.v);
  return Packet2cd(bits);
}
// Bitwise OR of the raw register contents.
template<> EIGEN_STRONG_INLINE Packet2cd por<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  const __m256d bits = _mm256_or_pd(a.v, b.v);
  return Packet2cd(bits);
}
// Bitwise XOR of the raw register contents.
template<> EIGEN_STRONG_INLINE Packet2cd pxor<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  const __m256d bits = _mm256_xor_pd(a.v, b.v);
  return Packet2cd(bits);
}
// Bitwise and-not of the raw register contents via _mm256_andnot_pd(a.v, b.v).
// NOTE(review): with this operand order the intrinsic yields (~a) & b; confirm
// that this matches the generic pandnot contract expected by callers.
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  const __m256d bits = _mm256_andnot_pd(a.v, b.v);
  return Packet2cd(bits);
}
// Aligned load of two complex<double> values: reinterprets the pointer as
// const double* and forwards to the Packet4d aligned load.
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const double* scalars = reinterpret_cast<const double*>(from);
  return Packet2cd(pload<Packet4d>(scalars));
}
// Unaligned load of two complex<double> values: reinterprets the pointer as
// const double* and forwards to the Packet4d unaligned load.
template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const double* scalars = reinterpret_cast<const double*>(from);
  return Packet2cd(ploadu<Packet4d>(scalars));
}
// Broadcasts one complex<double> to both slots: the 16-byte (re, im) pair is
// read through a __m128d pointer and replicated into both 128-bit halves.
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
{
  // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
  // return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
  const __m128d* as_pair = static_cast<const __m128d*>(static_cast<const void*>(&from));
  return Packet2cd(_mm256_broadcast_pd(as_pair));
}
// With two elements per packet, duplicating the first element is simply a
// broadcast of *from.
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from)
{
  const std::complex<double>& first = *from;
  return pset1<Packet2cd>(first);
}
// Aligned store of a complex<double> packet: reinterprets the destination as
// double* and forwards the raw register to the double pstore.
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  double* scalars = reinterpret_cast<double*>(to);
  pstore(scalars, from.v);
}
// Unaligned store of a complex<double> packet: reinterprets the destination as
// double* and forwards the raw register to the double pstoreu.
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  double* scalars = reinterpret_cast<double*>(to);
  pstoreu(scalars, from.v);
}
// Strided gather: reads from[0] and from[stride] and packs them as
// interleaved (re, im) pairs, element 0 in the lowest slots.
template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
{
  const std::complex<double>& c0 = from[0*stride];
  const std::complex<double>& c1 = from[1*stride];
  // _mm256_set_pd takes its arguments from the highest slot down to the lowest.
  return Packet2cd(_mm256_set_pd(std::imag(c1), std::real(c1),
                                 std::imag(c0), std::real(c0)));
}
// Strided scatter: writes the two complex values of the packet to to[0] and
// to[stride].
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
{
  // Lower 128 bits hold complex element 0.
  const __m128d lower = _mm256_extractf128_pd(from.v, 0);
  const double re0 = _mm_cvtsd_f64(lower);
  const double im0 = _mm_cvtsd_f64(_mm_shuffle_pd(lower, lower, 1));
  to[stride*0] = std::complex<double>(re0, im0);

  // Upper 128 bits hold complex element 1.
  const __m128d upper = _mm256_extractf128_pd(from.v, 1);
  const double re1 = _mm_cvtsd_f64(upper);
  const double im1 = _mm_cvtsd_f64(_mm_shuffle_pd(upper, upper, 1));
  to[stride*1] = std::complex<double>(re1, im1);
}
// Returns the first complex element: the lower 128 bits are spilled to a
// 16-byte aligned two-double buffer and read back as (re, im).
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{
  const __m128d lower = _mm256_extractf128_pd(a.v, 0);
  EIGEN_ALIGN16 double parts[2];
  _mm_store_pd(parts, lower);
  return std::complex<double>(parts[0], parts[1]);
}
// Reverses the two complex elements by swapping the 128-bit halves of the
// register (permute2f128 with immediate 1).
template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
  return Packet2cd(_mm256_permute2f128_pd(a.v, a.v, 1));
}
// Sum of the two complex elements: adds the two 128-bit halves as Packet1cd
// values and reduces the result with the Packet1cd predux.
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{
  const Packet1cd lower(_mm256_extractf128_pd(a.v, 0));
  const Packet1cd upper(_mm256_extractf128_pd(a.v, 1));
  return predux(padd(lower, upper));
}
// Reduces the two packets vecs[0] and vecs[1] into one: slot i of the result
// is the sum of the two complex elements of vecs[i].
template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{
  // 0x20 gathers the low 128-bit halves of both inputs, 0x31 the high halves.
  const Packet4d low_halves  = _mm256_permute2f128_pd(vecs[0].v, vecs[1].v, 0x20);
  const Packet4d high_halves = _mm256_permute2f128_pd(vecs[0].v, vecs[1].v, 0x31);

  return Packet2cd(_mm256_add_pd(low_halves, high_halves));
}
// Product of the two complex elements: multiplies the two 128-bit halves as
// Packet1cd values, then passes the single-element result through predux.
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{
  const Packet1cd lower(_mm256_extractf128_pd(a.v, 0));
  const Packet1cd upper(_mm256_extractf128_pd(a.v, 1));
  return predux(pmul(lower, upper));
}
// palign for complex<double> packets: forwards to the Packet4d implementation
// with the offset doubled (two doubles per complex element).
template<int Offset>
struct palign_impl<Offset,Packet2cd>
{
  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
  {
    // Offset 0 leaves 'first' untouched.
    if (Offset != 0)
    {
      palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
    }
  }
};
// conj_helper<Packet2cd,Packet2cd,false,true>: pmul computes a * conj(b).
template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
{
  // x * conj(y) + c, using this helper's own pmul.
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  {
    return padd(this->pmul(x, y), c);
  }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    const Packet2cd b_conj = pconj(b);
    return internal::pmul(a, b_conj);
  }
};
// conj_helper<Packet2cd,Packet2cd,true,false>: pmul computes conj(a) * b.
template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
{
  // conj(x) * y + c, using this helper's own pmul.
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  {
    return padd(this->pmul(x, y), c);
  }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    const Packet2cd a_conj = pconj(a);
    return internal::pmul(a_conj, b);
  }
};
// conj_helper<Packet2cd,Packet2cd,true,true>: pmul computes conj(a * b).
template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
{
  // conj(x * y) + c, using this helper's own pmul.
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  {
    return padd(this->pmul(x, y), c);
  }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    const Packet2cd product = internal::pmul(a, b);
    return pconj(product);
  }
};

// NOTE(review): macro defined elsewhere; presumably generates the mixed
// complex/real conj_helper specializations for these packet types - confirm.
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
// Complex division a / b computed as (a * conj(b)) / |b|^2.
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  const Packet2cd numerator = pmul(a, pconj(b));

  // hadd of the squares with themselves puts re^2 + im^2 in both slots of
  // each (re, im) pair.
  const __m256d squares = _mm256_mul_pd(b.v, b.v);
  const __m256d norm2 = _mm256_hadd_pd(squares, squares);

  return Packet2cd(_mm256_div_pd(numerator.v, norm2));
}
// Swaps the two doubles of every (re, im) pair.
template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{
  const __m256d flipped = _mm256_shuffle_pd(x.v, x.v, 0x5);
  return Packet2cd(flipped);
}
// In-place transpose of a 4x4 block of complex<float> values. Each 8-byte
// complex is handled as one double so 64-bit shuffles can move it whole.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4cf,4>& kernel) {
  const __m256d row0 = _mm256_castps_pd(kernel.packet[0].v);
  const __m256d row1 = _mm256_castps_pd(kernel.packet[1].v);
  const __m256d row2 = _mm256_castps_pd(kernel.packet[2].v);
  const __m256d row3 = _mm256_castps_pd(kernel.packet[3].v);

  // Immediate 0 interleaves the even-indexed elements of the two rows,
  // immediate 15 the odd-indexed ones (per 128-bit lane).
  const __m256d even01 = _mm256_shuffle_pd(row0, row1, 0);
  const __m256d odd01  = _mm256_shuffle_pd(row0, row1, 15);
  const __m256d even23 = _mm256_shuffle_pd(row2, row3, 0);
  const __m256d odd23  = _mm256_shuffle_pd(row2, row3, 15);

  // 32 (0x20) joins the low lanes of both inputs, 49 (0x31) the high lanes.
  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(even01, even23, 32));
  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(odd01,  odd23,  32));
  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(even01, even23, 49));
  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(odd01,  odd23,  49));
}
// In-place transpose of a 2x2 block of complex<double> values: the new row 0
// takes the low 128-bit halves of both rows, the new row 1 the high halves.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cd,2>& kernel) {
  const __m256d low_halves  = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0x20);
  const __m256d high_halves = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0x31);
  kernel.packet[0].v = low_halves;
  kernel.packet[1].v = high_halves;
}
// Returns a with its first complex element replaced by b. Blend mask 0x3
// takes floats 0 and 1 from the broadcast of b and the rest from a.
template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
{
  const Packet4cf b_everywhere = pset1<Packet4cf>(b);
  return Packet4cf(_mm256_blend_ps(a.v, b_everywhere.v, 0x3));
}
// Returns a with its first complex element replaced by b. Blend mask 0x3
// takes doubles 0 and 1 from the broadcast of b and the rest from a.
template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
{
  const Packet2cd b_everywhere = pset1<Packet2cd>(b);
  return Packet2cd(_mm256_blend_pd(a.v, b_everywhere.v, 0x3));
}
// Returns a with its last complex element replaced by b. Blend mask 0xC0
// (bits 6 and 7) takes floats 6 and 7 from the broadcast of b.
template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
{
  const Packet4cf b_everywhere = pset1<Packet4cf>(b);
  return Packet4cf(_mm256_blend_ps(a.v, b_everywhere.v, 0xC0));
}
// Returns a with its last complex element replaced by b. Blend mask 0xC
// (bits 2 and 3) takes doubles 2 and 3 from the broadcast of b.
template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
{
  const Packet2cd b_everywhere = pset1<Packet2cd>(b);
  return Packet2cd(_mm256_blend_pd(a.v, b_everywhere.v, 0xC));
}
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_COMPLEX_AVX_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/MathFunctions.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
#define EIGEN_MATH_FUNCTIONS_AVX_H
/* The sin, cos, exp, and log functions of this file are loosely derived from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
namespace
Eigen
{
namespace
internal
{
// Shifts every 32-bit integer of v left by n bits. Without AVX2 the shift is
// done on the two 128-bit halves separately and the result is reassembled.
inline Packet8i pshiftleft(Packet8i v, int n)
{
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_slli_epi32(v, n);
#else
  const __m128i low_half  = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
  const __m128i high_half = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
  const __m256i widened = _mm256_castsi128_si256(low_half);
  return _mm256_insertf128_si256(widened, (high_half), 1);
#endif
}
// Reinterprets v as eight 32-bit integers, logically shifts each right by n
// bits, and converts the resulting integers to floats. Without AVX2 the shift
// is done on the two 128-bit halves separately.
inline Packet8f pshiftright(Packet8f v, int n)
{
#ifdef EIGEN_VECTORIZE_AVX2
  const __m256i shifted = _mm256_srli_epi32(_mm256_castps_si256(v), n);
  return _mm256_cvtepi32_ps(shifted);
#else
  const __m128i low_half  = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
  const __m128i high_half = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
  const __m256i shifted = _mm256_insertf128_si256(_mm256_castsi128_si256(low_half), (high_half), 1);
  return _mm256_cvtepi32_ps(shifted);
#endif
}
// Sine function.
// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and evaluating
// one of two polynomial interpolants, for [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The
// interpolants are (anti-)symmetric and thus have only odd/even coefficients.
// The sign of the result is flipped wherever an odd multiple of Pi was removed.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psin<Packet8f>(const Packet8f& _x) {
  Packet8f x = _x;

  // Constants used for the range reduction.
  _EIGEN_DECLARE_CONST_Packet8i(one, 1);
  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
  _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
  _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
  // -Pi split into three parts that are subtracted one after the other.
  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
  _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);

  // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
  Packet8f z = pmul(x, p8f_one_over_pi);
  const Packet8f period = _mm256_floor_ps(padd(z, p8f_one_over_four));
  x = pmadd(period, p8f_neg_pi_first, x);
  x = pmadd(period, p8f_neg_pi_second, x);
  x = pmadd(period, p8f_neg_pi_third, x);
  z = pmul(x, p8f_four_over_pi);

  // Make a mask for the entries that need flipping, i.e. wherever the shift
  // is odd: keep the lowest bit of the integer shift and move it to the sign
  // bit position.
  const Packet8i period_ints = _mm256_cvtps_epi32(period);
  const Packet8i period_isodd = _mm256_castps_si256(
      _mm256_and_ps(_mm256_castsi256_ps(period_ints), _mm256_castsi256_ps(p8i_one)));
  const Packet8i sign_flip_mask = pshiftleft(period_isodd, 31);

  // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
  // is set to ones for that entry.
  const Packet8f use_right = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);

  // Evaluate the (even) polynomial for the interval [1,3] in z, in (z-2)^2.
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
  const Packet8f z_minus_two = psub(z, p8f_two);
  const Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
  Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
  right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
  right = pmadd(right, z_minus_two2, p8f_coeff_right_0);

  // Evaluate the (odd) polynomial for the interval [-1,1] in z.
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
  const Packet8f z2 = pmul(z, z);
  Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
  left = pmadd(left, z2, p8f_coeff_left_3);
  left = pmadd(left, z2, p8f_coeff_left_1);
  left = pmul(left, z);

  // Assemble the results, i.e. select the left and right polynomials.
  const Packet8f picked = _mm256_or_ps(_mm256_andnot_ps(use_right, left),
                                       _mm256_and_ps(use_right, right));

  // Flip the sign on the odd intervals and return the result.
  return _mm256_xor_ps(picked, _mm256_castsi256_ps(sign_flip_mask));
}
// Natural logarithm.
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
// Inputs that fail "x >= 0" (including NaN) produce an all-ones bit pattern,
// and inputs equal to zero produce -INF.
// TODO(gonnet): Further reduce the interval allowing for lower-degree
//               polynomial interpolants -> ... -> profit!
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
plog<Packet8f>(const Packet8f& _x) {
  Packet8f x = _x;
  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);

  // Bit mask that clears the exponent field of a float.
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);

  // The smallest non denormalized float number.
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);

  // Polynomial coefficients.
  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
  // log(2) split into two parts, q1 + q2.
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);

  // "Not greater-or-equal" is also true if x is NaN.
  const Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ);
  const Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);

  // Truncate input values to the minimum positive normal.
  x = pmax(x, p8f_min_norm_pos);

  // Shift the bit pattern right by 23 and convert to float, then subtract 126.
  const Packet8f shifted_bits = pshiftright(x, 23);
  Packet8f e = _mm256_sub_ps(shifted_bits, p8f_126f);

  // Set the exponents to -1, i.e. x are in the range [0.5,1).
  x = _mm256_and_ps(x, p8f_inv_mant_mask);
  x = _mm256_or_ps(x, p8f_half);

  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
  // and shift by -1. The values are then centered around 0, which improves
  // the stability of the polynomial evaluation.
  //   if( x < SQRTHF ) {
  //     e -= 1;
  //     x = x + x - 1.0;
  //   } else { x = x - 1.0; }
  const Packet8f below_sqrthf = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
  const Packet8f x_if_below = _mm256_and_ps(x, below_sqrthf);
  x = psub(x, p8f_1);
  e = psub(e, _mm256_and_ps(p8f_1, below_sqrthf));
  x = padd(x, x_if_below);

  const Packet8f x2 = pmul(x, x);
  const Packet8f x3 = pmul(x2, x);

  // Evaluate the polynomial approximant of degree 8 in three parts, probably
  // to improve instruction-level parallelism.
  Packet8f y, y1, y2;
  y  = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
  y  = pmadd(y, x, p8f_cephes_log_p2);
  y1 = pmadd(y1, x, p8f_cephes_log_p5);
  y2 = pmadd(y2, x, p8f_cephes_log_p8);
  y  = pmadd(y, x3, y1);
  y  = pmadd(y, x3, y2);
  y  = pmul(y, x3);

  // Add the logarithm of the exponent back to the result of the interpolation.
  y1 = pmul(e, p8f_cephes_log_q1);
  const Packet8f half_x2 = pmul(x2, p8f_half);
  y = padd(y, y1);
  x = psub(x, half_x2);
  y2 = pmul(e, p8f_cephes_log_q2);
  x = padd(x, y);
  x = padd(x, y2);

  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
  const Packet8f with_invalid = _mm256_or_ps(x, invalid_mask);
  return _mm256_or_ps(
      _mm256_andnot_ps(iszero_mask, with_invalid),
      _mm256_and_ps(iszero_mask, p8f_minus_inf));
}
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where r is in the range [-log(2)/2, log(2)/2).
template
<
>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet8f
pexp
<
Packet8f
>
(
const
Packet8f
&
_x
)
{
_EIGEN_DECLARE_CONST_Packet8f
(
1
,
1.0
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
half
,
0.5
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
127
,
127.0
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
exp_hi
,
88.3762626647950
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
exp_lo
,
-
88.3762626647949
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_LOG2EF
,
1.44269504088896341
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_p0
,
1.9875691500E-4
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_p1
,
1.3981999507E-3
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_p2
,
8.3334519073E-3
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_p3
,
4.1665795894E-2
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_p4
,
1.6666665459E-1
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_p5
,
5.0000001201E-1
f
);
// Clamp x.
Packet8f
x
=
pmax
(
pmin
(
_x
,
p8f_exp_hi
),
p8f_exp_lo
);
// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
Packet8f
m
=
_mm256_floor_ps
(
pmadd
(
x
,
p8f_cephes_LOG2EF
,
p8f_half
));
// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
// truncation errors. Note that we don't use the "pmadd" function here to
// ensure that a precision-preserving FMA instruction is used.
#ifdef EIGEN_VECTORIZE_FMA
_EIGEN_DECLARE_CONST_Packet8f
(
nln2
,
-
0.6931471805599453
f
);
Packet8f
r
=
_mm256_fmadd_ps
(
m
,
p8f_nln2
,
x
);
#else
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_C1
,
0.693359375
f
);
_EIGEN_DECLARE_CONST_Packet8f
(
cephes_exp_C2
,
-
2.12194440e-4
f
);
Packet8f
r
=
psub
(
x
,
pmul
(
m
,
p8f_cephes_exp_C1
));
r
=
psub
(
r
,
pmul
(
m
,
p8f_cephes_exp_C2
));
#endif
Packet8f
r2
=
pmul
(
r
,
r
);
// TODO(gonnet): Split into odd/even polynomials and try to exploit
// instruction-level parallelism.
Packet8f
y
=
p8f_cephes_exp_p0
;
y
=
pmadd
(
y
,
r
,
p8f_cephes_exp_p1
);
y
=
pmadd
(
y
,
r
,
p8f_cephes_exp_p2
);
y
=
pmadd
(
y
,
r
,
p8f_cephes_exp_p3
);
y
=
pmadd
(
y
,
r
,
p8f_cephes_exp_p4
);
y
=
pmadd
(
y
,
r
,
p8f_cephes_exp_p5
);
y
=
pmadd
(
y
,
r2
,
r
);
y
=
padd
(
y
,
p8f_1
);
// Build emm0 = 2^m.
Packet8i
emm0
=
_mm256_cvttps_epi32
(
padd
(
m
,
p8f_127
));
emm0
=
pshiftleft
(
emm0
,
23
);
// Return 2^m * exp(r).
return
pmax
(
pmul
(
y
,
_mm256_castsi256_ps
(
emm0
)),
_x
);
}
// Hyperbolic tangent for 8 floats: forwards to the shared
// internal::generic_fast_tanh_float implementation.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
ptanh<Packet8f>(const Packet8f& x) {
  const Packet8f result = internal::generic_fast_tanh_float(x);
  return result;
}
// Vectorized double-precision exponential (4 lanes).
//
// The clamped argument is written as "x = g + n*ln(2)" with
// "n = floor(x/ln(2) + 1/2)". exp(g) is evaluated through the rational form
// "1 + 2*px / (qx - px)", where px = g*P(g^2) and qx = Q(g^2), and the result
// is scaled by 2^n built directly in the exponent field of a double.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
pexp<Packet4d>(const Packet4d& _x) {
  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);

  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);

  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);

  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);

  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);

  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);

  // Restrict the argument to [exp_lo, exp_hi].
  const Packet4d clamped = pmax(pmin(_x, p4d_exp_hi), p4d_exp_lo);

  // n = floor(x/ln(2) + 0.5).
  const Packet4d n = _mm256_floor_pd(pmadd(p4d_cephes_LOG2EF, clamped, p4d_half));

  // g = x - n*ln(2), removed in two steps (n*C1 then n*C2, C1 + C2 = ln(2))
  // to get the last digits right.
  const Packet4d n_c1 = pmul(n, p4d_cephes_exp_C1);
  const Packet4d n_c2 = pmul(n, p4d_cephes_exp_C2);
  const Packet4d g = psub(psub(clamped, n_c1), n_c2);

  const Packet4d g_sq = pmul(g, g);

  // Numerator of the rational interpolant: g * P(g^2).
  Packet4d numer = pmadd(p4d_cephes_exp_p0, g_sq, p4d_cephes_exp_p1);
  numer = pmadd(numer, g_sq, p4d_cephes_exp_p2);
  numer = pmul(numer, g);

  // Denominator of the rational interpolant: Q(g^2).
  Packet4d denom = pmadd(p4d_cephes_exp_q0, g_sq, p4d_cephes_exp_q1);
  denom = pmadd(denom, g_sq, p4d_cephes_exp_q2);
  denom = pmadd(denom, g_sq, p4d_cephes_exp_q3);

  // exp(g) ~= 1 + 2 * numer / (denom - numer); this form was taken over from
  // the SSE2 routines.
  // TODO(gonnet): Figure out what is going on here, perhaps find a better
  // rational interpolant?
  const Packet4d ratio = _mm256_div_pd(numer, psub(denom, numer));
  const Packet4d exp_g = pmadd(p4d_2, ratio, p4d_1);

  // Build 2^n: convert n to four 32-bit integers, add the exponent bias,
  // reorder them so that each 64-bit slot can be filled by a shift, and move
  // the values up to bit 52 where the double exponent field starts.
  __m128i biased = _mm256_cvtpd_epi32(n);
  biased = _mm_add_epi32(biased, p4i_1023);
  biased = _mm_shuffle_epi32(biased, _MM_SHUFFLE(3, 1, 2, 0));
  const __m128i low_pair = _mm_slli_epi64(biased, 52);
  const __m128i high_pair = _mm_slli_epi64(_mm_srli_epi64(biased, 32), 52);
  __m256i pow2_bits = _mm256_insertf128_si256(_mm256_setzero_si256(), low_pair, 0);
  pow2_bits = _mm256_insertf128_si256(pow2_bits, high_pair, 1);

  // 2^n * exp(g). The max with the original input is used to catch
  // non-finite values in the input.
  return pmax(pmul(exp_g, _mm256_castsi256_pd(pow2_bits)), _x);
}
// Square root for 8 floats.
//
// With EIGEN_FAST_MATH the result is x * rsqrt(x), where rsqrt comes from the
// _mm256_rsqrt_ps approximation refined by one Newton step. This costs 1-2
// bits of precision compared with the exact instruction and does not handle
// +inf or denormalized numbers correctly; inputs in [0, FLT_MIN) are flushed
// to zero. The benefit is not just speed: the sequence can be inlined and
// pipelined with surrounding computations, further reducing its effective
// latency. This is similar to Quake3's fast inverse square root; for details
// see http://www.beyond3d.com/content/articles/8/
//
// Without EIGEN_FAST_MATH the exact _mm256_sqrt_ps instruction is used.
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psqrt<Packet8f>(const Packet8f& _x) {
  const Packet8f half_x = pmul(_x, pset1<Packet8f>(0.5f));

  // Lanes with 0 <= x < smallest positive normal float.
  const Packet8f below_normal =
      _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()), _CMP_LT_OQ);
  const Packet8f non_negative = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ);
  const Packet8f denormal_mask = _mm256_and_ps(below_normal, non_negative);

  // Approximate reciprocal square root.
  const Packet8f rsqrt_estimate = _mm256_rsqrt_ps(_x);

  // One Newton iteration: y <- y * (1.5 - 0.5*x*y*y).
  const Packet8f estimate_sq = pmul(rsqrt_estimate, rsqrt_estimate);
  const Packet8f correction = psub(pset1<Packet8f>(1.5f), pmul(half_x, estimate_sq));
  const Packet8f rsqrt_refined = pmul(rsqrt_estimate, correction);

  // sqrt(x) = x * rsqrt(x); zero out the lanes selected by denormal_mask.
  return _mm256_andnot_ps(denormal_mask, pmul(_x, rsqrt_refined));
}
#else
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psqrt<Packet8f>(const Packet8f& x) {
  const Packet8f root = _mm256_sqrt_ps(x);
  return root;
}
#endif
// Square root for 4 doubles, computed by the _mm256_sqrt_pd instruction.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
psqrt<Packet4d>(const Packet4d& x) {
  const Packet4d root = _mm256_sqrt_pd(x);
  return root;
}
// Reciprocal square root for 8 floats.
//
// With EIGEN_FAST_MATH: _mm256_rsqrt_ps estimate plus one Newton step for
// inputs >= FLT_MIN; negative inputs yield NaN and inputs in [0, FLT_MIN)
// yield +inf. Without EIGEN_FAST_MATH: 1 / sqrt(x) with exact instructions.
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
prsqrt<Packet8f>(const Packet8f& _x) {
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);

  const Packet8f minus_half_x = pmul(_x, p8f_minus_half);

  // Keep the hardware estimate only for positive normal inputs; every lane
  // with x < FLT_MIN (negatives, zero, denormals) is cleared to zero here.
  const Packet8f below_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
  const Packet8f estimate = _mm256_andnot_ps(below_min_mask, _mm256_rsqrt_ps(_x));

  // Special values: NaN for x < 0, +inf for 0 <= x < FLT_MIN.
  const Packet8f negative_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
  const Packet8f tiny_mask = _mm256_andnot_ps(negative_mask, below_min_mask);
  const Packet8f nan_lanes = _mm256_and_ps(negative_mask, p8f_nan);
  const Packet8f inf_lanes = _mm256_and_ps(tiny_mask, p8f_inf);
  const Packet8f special_values = _mm256_or_ps(nan_lanes, inf_lanes);

  // One Newton iteration: y <- y * (1.5 - 0.5*x*y*y).
  const Packet8f estimate_sq = pmul(estimate, estimate);
  const Packet8f refined =
      pmul(estimate, pmadd(minus_half_x, estimate_sq, p8f_one_point_five));

  // Cleared lanes are still zero after the Newton step, so OR-ing inserts
  // the special values in exactly those lanes.
  return _mm256_or_ps(refined, special_values);
}
#else
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
prsqrt<Packet8f>(const Packet8f& x) {
  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
  const Packet8f root = _mm256_sqrt_ps(x);
  return _mm256_div_ps(p8f_one, root);
}
#endif
// Reciprocal square root for 4 doubles: 1.0 / sqrt(x) using the exact
// square-root and division instructions.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
prsqrt<Packet4d>(const Packet4d& x) {
  _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
  const Packet4d root = _mm256_sqrt_pd(x);
  return _mm256_div_pd(p4d_one, root);
}
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_AVX_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/PacketMath.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H
namespace
Eigen
{
namespace
internal
{
// Architecture defaults; each one is only set when not already defined, so a
// definition made before this header is included takes precedence.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// Evaluates to twice the pointer size, i.e. 16 with 8-byte pointers and 8 with
// 4-byte pointers.
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

// When the compiler targets FMA (__FMA__ defined), advertise that a
// multiply-add is a single instruction.
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
// 256-bit AVX packet types used throughout this file.
typedef __m256  Packet8f;  // 8 single-precision floats
typedef __m256i Packet8i;  // 256-bit integer register (used here as 8 x 32-bit ints)
typedef __m256d Packet4d;  // 4 double-precision floats
// Flag the three raw 256-bit register types as arithmetic types.
template<> struct is_arithmetic<__m256>  { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
// Helpers declaring a const packet with every lane set to the same value.
// Each expands to a declaration WITHOUT a trailing semicolon, so the caller
// supplies it. The variable is named p8f_<NAME>, p4d_<NAME> or p8i_<NAME>.

// Packet8f with all lanes equal to the float X.
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
const Packet8f p8f_##NAME = pset1<Packet8f>(X)
// Packet4d with all lanes equal to the double X.
#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
const Packet4d p4d_##NAME = pset1<Packet4d>(X)
// Packet8f whose lanes all carry the 32-bit pattern X, reinterpreted as float
// (a bit cast, not a numeric conversion).
#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))
// Packet8i with all lanes equal to the int X.
#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
const Packet8i p8i_##NAME = pset1<Packet8i>(X)
// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
// Vectorization traits for float: full packet is Packet8f (8 lanes), half
// packet is Packet4f. The Has* flags advertise which packet operations are
// available; sin and tanh are only advertised when EIGEN_FAST_MATH is
// non-zero, and cos is not advertised.
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 1,

    HasDiv = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = 0,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
// Vectorization traits for double: full packet is Packet4d (4 lanes), half
// packet is Packet2d.
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 1,

    HasDiv = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
#endif
// Division cost constants for the specializations whose second template
// argument is true: 14 for float and 16 for double.
// NOTE(review): presumably the boolean selects the vectorized case and the
// values are relative cost estimates - confirm against the primary template.
template<> struct scalar_div_cost<float, true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double, true> { enum { value = 16 }; };
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet8i type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
};
};
*/
// Reverse mapping from each AVX packet type to its scalar type, its
// half-width packet type, its lane count and its alignment tag (Aligned32).
template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum { size = 8, alignment = Aligned32 }; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum { size = 4, alignment = Aligned32 }; };
template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum { size = 8, alignment = Aligned32 }; };
// pset1: broadcast one scalar into every lane of a packet.

// 8 copies of the float `from`.
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from)
{
  const Packet8f splat = _mm256_set1_ps(from);
  return splat;
}

// 4 copies of the double `from`.
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from)
{
  const Packet4d splat = _mm256_set1_pd(from);
  return splat;
}

// 8 copies of the 32-bit int `from`.
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from)
{
  const Packet8i splat = _mm256_set1_epi32(from);
  return splat;
}
// pload1: read a single scalar from memory and broadcast it to all lanes.

// Broadcasts *from to 8 float lanes.
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from)
{
  const Packet8f splat = _mm256_broadcast_ss(from);
  return splat;
}

// Broadcasts *from to 4 double lanes.
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from)
{
  const Packet4d splat = _mm256_broadcast_sd(from);
  return splat;
}
// plset: build the arithmetic sequence {a, a+1, a+2, ...}, lane 0 first.

// Returns {a, a+1, ..., a+7}.
template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a)
{
  const Packet8f base = _mm256_set1_ps(a);
  // _mm256_set_ps lists lanes from highest to lowest, so lane i holds i.
  const Packet8f ramp = _mm256_set_ps(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
  return _mm256_add_ps(base, ramp);
}

// Returns {a, a+1, a+2, a+3}.
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a)
{
  const Packet4d base = _mm256_set1_pd(a);
  // _mm256_set_pd lists lanes from highest to lowest, so lane i holds i.
  const Packet4d ramp = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
  return _mm256_add_pd(base, ramp);
}
// padd: lane-wise a + b.
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f sum = _mm256_add_ps(a, b);
  return sum;
}

template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d sum = _mm256_add_pd(a, b);
  return sum;
}
// psub: lane-wise a - b.
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f difference = _mm256_sub_ps(a, b);
  return difference;
}

template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d difference = _mm256_sub_pd(a, b);
  return difference;
}
// pnegate: lane-wise -a, implemented by flipping the IEEE-754 sign bit.
//
// Computing 0 - a instead gives the wrong sign for zero (0 - (+0) is +0 under
// the default rounding mode, whereas -(+0) must be -0) and is an arithmetic
// operation rather than a pure sign change for NaN inputs. XOR-ing the sign
// bit negates every value, including zeros, infinities and NaNs, exactly.

// Negates 8 floats.
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  // Bit 31 of every 32-bit lane is the float sign bit.
  const Packet8f sign_mask = _mm256_castsi256_ps(_mm256_setr_epi32(
      0x80000000, 0x80000000, 0x80000000, 0x80000000,
      0x80000000, 0x80000000, 0x80000000, 0x80000000));
  return _mm256_xor_ps(a, sign_mask);
}

// Negates 4 doubles.
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  // Each double spans two 32-bit words (low word first); the sign bit is
  // bit 31 of the high word.
  const Packet4d sign_mask = _mm256_castsi256_pd(_mm256_setr_epi32(
      0x0, 0x80000000, 0x0, 0x80000000,
      0x0, 0x80000000, 0x0, 0x80000000));
  return _mm256_xor_pd(a, sign_mask);
}
// pconj: complex conjugate. For these real/integer packets it is the
// identity, so the input is returned unchanged.
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
// pmul: lane-wise a * b.
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f product = _mm256_mul_ps(a, b);
  return product;
}

template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d product = _mm256_mul_pd(a, b);
  return product;
}
// pdiv: lane-wise a / b.
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f quotient = _mm256_div_ps(a, b);
  return quotient;
}

template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d quotient = _mm256_div_pd(a, b);
  return quotient;
}

// Integer packet division is not available: this overload always trips
// eigen_assert, and returns an all-zero packet if the assertion does not
// stop execution.
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{
  eigen_assert(false && "packet integer division are not supported by AVX");
  const Packet8i zeros = pset1<Packet8i>(0);
  return zeros;
}
// pmadd: fused multiply-add a*b + c, only specialized here when the compiler
// targets FMA.
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c)
{
#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
  // Force the vfmadd231ps form (accumulator += a*b), since accumulating the
  // result of a product is the most common use case:
  //  - Clang otherwise emits vfmadd213ps plus extra vmovaps, and even spills
  //    registers with clang>=6.0 (bug 1637).
  //  - GCC otherwise emits vfmadd132ps.
  Packet8f accumulator = c;
  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (accumulator) : [a] "x" (a), [b] "x" (b));
  return accumulator;
#else
  return _mm256_fmadd_ps(a, b, c);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c)
{
#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
  // Same rationale as the float overload: force vfmadd231pd.
  Packet4d accumulator = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (accumulator) : [a] "x" (a), [b] "x" (b));
  return accumulator;
#else
  return _mm256_fmadd_pd(a, b, c);
#endif
}
#endif
// pmin: lane-wise minimum, with the semantics of the vminps/vminpd
// instructions (operand order is passed through unchanged).
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f smaller = _mm256_min_ps(a, b);
  return smaller;
}

template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d smaller = _mm256_min_pd(a, b);
  return smaller;
}
// pmax: lane-wise maximum, with the semantics of the vmaxps/vmaxpd
// instructions (operand order is passed through unchanged).
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f larger = _mm256_max_ps(a, b);
  return larger;
}

template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d larger = _mm256_max_pd(a, b);
  return larger;
}
// pround: lane-wise rounding to an integral value using the rounding mode
// currently set in MXCSR (_MM_FROUND_CUR_DIRECTION).
// NOTE(review): under the default round-to-nearest-even mode, halfway cases
// go to the even neighbour, which differs from std::round - confirm this is
// what callers expect.
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
{
  const Packet8f rounded = _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
  return rounded;
}

template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
{
  const Packet4d rounded = _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION);
  return rounded;
}
// pceil: lane-wise rounding towards +infinity.
template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a)
{
  const Packet8f rounded_up = _mm256_ceil_ps(a);
  return rounded_up;
}

template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a)
{
  const Packet4d rounded_up = _mm256_ceil_pd(a);
  return rounded_up;
}
// pfloor: lane-wise rounding towards -infinity.
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a)
{
  const Packet8f rounded_down = _mm256_floor_ps(a);
  return rounded_down;
}

template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a)
{
  const Packet4d rounded_down = _mm256_floor_pd(a);
  return rounded_down;
}
// pand: bitwise a & b on the raw packet bits.
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f bits = _mm256_and_ps(a, b);
  return bits;
}

template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d bits = _mm256_and_pd(a, b);
  return bits;
}
// por: bitwise a | b on the raw packet bits.
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f bits = _mm256_or_ps(a, b);
  return bits;
}

template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d bits = _mm256_or_pd(a, b);
  return bits;
}
// pxor: bitwise a ^ b on the raw packet bits.
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f bits = _mm256_xor_ps(a, b);
  return bits;
}

template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d bits = _mm256_xor_pd(a, b);
  return bits;
}
// pandnot: forwards (a, b) unchanged to the vandnps/vandnpd instructions,
// which compute (~a) & b, i.e. the FIRST operand is the one complemented.
// NOTE(review): if the generic pandnot contract is "a & ~b", the operands
// here are swapped relative to it - verify against the primary template and
// the callers before relying on either meaning.
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b)
{
  const Packet8f bits = _mm256_andnot_ps(a, b);
  return bits;
}

template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b)
{
  const Packet4d bits = _mm256_andnot_pd(a, b);
  return bits;
}
// pload: aligned load of one full packet. The aligned load instructions
// require `from` to be 32-byte aligned.

// Loads 8 consecutive floats.
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const Packet8f loaded = _mm256_load_ps(from);
  return loaded;
}

// Loads 4 consecutive doubles.
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const Packet4d loaded = _mm256_load_pd(from);
  return loaded;
}

// Loads 8 consecutive 32-bit ints.
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const __m256i* source = reinterpret_cast<const __m256i*>(from);
  return _mm256_load_si256(source);
}
// ploadu: load of one full packet from an address with no alignment
// requirement.

// Loads 8 consecutive floats.
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const Packet8f loaded = _mm256_loadu_ps(from);
  return loaded;
}

// Loads 4 consecutive doubles.
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const Packet4d loaded = _mm256_loadu_pd(from);
  return loaded;
}

// Loads 8 consecutive 32-bit ints.
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const __m256i* source = reinterpret_cast<const __m256i*>(from);
  return _mm256_loadu_si256(source);
}
// Loads 4 floats from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3}.
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register.
  // A straightforward alternative would be:
  //   Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
  //   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
  //   return _mm256_unpacklo_ps(tmp,tmp);
  // but _mm256_insertf128_ps is very slow on Haswell, thus:

  // Both 128-bit halves hold {a0, a1, a2, a3}.
  const Packet8f both_halves = _mm256_broadcast_ps((const __m128*)(const void*)from);

  // Mimic an "in-place" permutation of the lower 128 bits using a blend:
  // the low half becomes {a0, a1, a0, a1}, the high half is untouched.
  const __m128 low_pairs = _mm_permute_ps(_mm256_castps256_ps128(both_halves), _MM_SHUFFLE(1,0,1,0));
  const Packet8f staged = _mm256_blend_ps(both_halves, _mm256_castps128_ps256(low_pairs), 15);

  // The same per-half permutation {e2, e2, e3, e3} now yields
  // {a0, a0, a1, a1} in the low half and {a2, a2, a3, a3} in the high half.
  return _mm256_permute_ps(staged, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}.
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  // Both 128-bit halves hold {a0, a1}.
  const Packet4d both_halves = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  // Control 0b1100: the low half picks element 0 twice, the high half picks
  // element 1 twice.
  return _mm256_permute_pd(both_halves, 3<<2);
}
// Loads 2 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1}.
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
  // Low half: a0 in all four lanes (upper half is filled in below).
  const Packet8f low_filled = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  // High half: a1 in all four lanes.
  const __m128 second = _mm_broadcast_ss(from+1);
  return _mm256_insertf128_ps(low_filled, second, 1);
}
// pstore: aligned store of one full packet to `to`.

// Stores 8 floats with the aligned store instruction.
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
// Stores 4 doubles with the aligned store instruction.
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
// Stores 8 ints. NOTE(review): unlike the float/double overloads (and unlike
// pload<Packet8i>), this one issues the UNALIGNED store _mm256_storeu_si256
// while still counting as an aligned store for EIGEN_DEBUG_ALIGNED_STORE.
// That is functionally safe; confirm whether it is intentional.
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
// pstoreu: store of one full packet to an address with no alignment
// requirement.

// Stores 8 floats.
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
// Stores 4 doubles.
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
// Stores 8 ints.
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
// pgather: build a packet from elements spaced `stride` scalars apart;
// lane i receives from[i*stride].
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
{
  const float e0 = from[0*stride];
  const float e1 = from[1*stride];
  const float e2 = from[2*stride];
  const float e3 = from[3*stride];
  const float e4 = from[4*stride];
  const float e5 = from[5*stride];
  const float e6 = from[6*stride];
  const float e7 = from[7*stride];
  // _mm256_set_ps takes the highest lane first.
  return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
}

template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
{
  const double e0 = from[0*stride];
  const double e1 = from[1*stride];
  const double e2 = from[2*stride];
  const double e3 = from[3*stride];
  // _mm256_set_pd takes the highest lane first.
  return _mm256_set_pd(e3, e2, e1, e0);
}
// pscatter: write the lanes of a packet to memory `stride` scalars apart;
// lane i goes to to[stride*i]. Each 128-bit half is extracted, then its
// elements are moved to position 0 with a shuffle and read out as a scalar.
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
{
  // Lanes 0..3.
  const __m128 lower = _mm256_extractf128_ps(from, 0);
  to[stride*0] = _mm_cvtss_f32(lower);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 3));

  // Lanes 4..7.
  const __m128 upper = _mm256_extractf128_ps(from, 1);
  to[stride*4] = _mm_cvtss_f32(upper);
  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 1));
  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 2));
  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 3));
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
{
  // Lanes 0..1.
  const __m128d lower = _mm256_extractf128_pd(from, 0);
  to[stride*0] = _mm_cvtsd_f64(lower);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(lower, lower, 1));

  // Lanes 2..3.
  const __m128d upper = _mm256_extractf128_pd(from, 1);
  to[stride*2] = _mm_cvtsd_f64(upper);
  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(upper, upper, 1));
}
// pstore1: fill one packet's worth of memory at `to` with copies of `a`,
// using the aligned pstore.

// Writes 8 copies of the float `a`.
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
  const Packet8f splat = pset1<Packet8f>(a);
  pstore(to, splat);
}

// Writes 4 copies of the double `a`.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
  const Packet4d splat = pset1<Packet4d>(a);
  pstore(to, splat);
}

// Writes 8 copies of the int `a`.
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
  const Packet8i splat = pset1<Packet8i>(a);
  pstore(to, splat);
}
// prefetch: issue an _MM_HINT_T0 prefetch for the given address. These
// specializations are skipped when EIGEN_VECTORIZE_AVX512 is defined.
// NOTE(review): presumably the AVX512 header provides its own versions in
// that case - confirm.
#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
// pfirst: extract lane 0 of a packet as a scalar.

template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a)
{
  const __m128 low_half = _mm256_castps256_ps128(a);
  return _mm_cvtss_f32(low_half);
}

template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a)
{
  const __m128d low_half = _mm256_castpd256_pd128(a);
  return _mm_cvtsd_f64(low_half);
}

template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a)
{
  const __m128i low_half = _mm256_castsi256_si128(a);
  return _mm_cvtsi128_si32(low_half);
}
// preverse: reverse the order of the lanes. Elements are first reversed
// inside each 128-bit half, then the two halves are swapped.

// {a0..a7} -> {a7..a0}.
template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
  // 0x1b selects elements 3,2,1,0 within each half.
  const __m256 reversed_in_halves = _mm256_shuffle_ps(a, a, 0x1b);
  return _mm256_permute2f128_ps(reversed_in_halves, reversed_in_halves, 1);
}

// {a0, a1, a2, a3} -> {a3, a2, a1, a0}.
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
  // 5 (0b0101) swaps the two elements within each half.
  const __m256d reversed_in_halves = _mm256_shuffle_pd(a, a, 5);
  return _mm256_permute2f128_pd(reversed_in_halves, reversed_in_halves, 1);

  #if 0
  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
    return _mm256_permute_pd(swap_halves,5);
  #endif
}
// pabs: lane-wise absolute value, obtained by clearing the sign bit of every
// element with a bitwise AND.

template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
  // 0x7FFFFFFF keeps everything except bit 31 (the float sign bit).
  const Packet8f keep_magnitude = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
  return _mm256_and_ps(a, keep_magnitude);
}

template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
  // Each double spans two 32-bit words (low word first): keep the low word
  // entirely and everything but the top bit of the high word.
  const Packet4d keep_magnitude = _mm256_castsi256_pd(_mm256_setr_epi32(
      0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF,
      0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
  return _mm256_and_pd(a, keep_magnitude);
}
// preduxp: reduce several packets at once. Lane i of the result is the sum of
// all lanes of vecs[i].
// FIXME: why is this ok? why isn't the simple implementation working as expected?

// Reads vecs[0..7].
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
  // First horizontal add: pair sums of two inputs, interleaved per 128-bit half.
  const __m256 pairs01 = _mm256_hadd_ps(vecs[0], vecs[1]);
  const __m256 pairs23 = _mm256_hadd_ps(vecs[2], vecs[3]);
  const __m256 pairs45 = _mm256_hadd_ps(vecs[4], vecs[5]);
  const __m256 pairs67 = _mm256_hadd_ps(vecs[6], vecs[7]);

  // Second horizontal add: each half now holds the per-half totals of its
  // two inputs.
  const __m256 halves01 = _mm256_hadd_ps(pairs01, pairs01);
  const __m256 halves23 = _mm256_hadd_ps(pairs23, pairs23);
  const __m256 halves45 = _mm256_hadd_ps(pairs45, pairs45);
  const __m256 halves67 = _mm256_hadd_ps(pairs67, pairs67);

  // Swap the 128-bit halves ...
  const __m256 swapped01 = _mm256_permute2f128_ps(halves01, halves01, 0x23);
  const __m256 swapped23 = _mm256_permute2f128_ps(halves23, halves23, 0x23);
  const __m256 swapped45 = _mm256_permute2f128_ps(halves45, halves45, 0x23);
  const __m256 swapped67 = _mm256_permute2f128_ps(halves67, halves67, 0x23);

  // ... and add, giving the full totals replicated across the register.
  const __m256 total01 = _mm256_add_ps(swapped01, halves01);
  const __m256 total23 = _mm256_add_ps(swapped23, halves23);
  const __m256 total45 = _mm256_add_ps(swapped45, halves45);
  const __m256 total67 = _mm256_add_ps(swapped67, halves67);

  // Pick the right total for each output lane.
  const __m256 total0123 = _mm256_blend_ps(total01, total23, 0xcc);
  const __m256 total4567 = _mm256_blend_ps(total45, total67, 0xcc);

  const __m256 result = _mm256_blend_ps(total0123, total4567, 0xf0);
  return result;
}

// Reads vecs[0..3].
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
  // Per-half pair sums of vecs[0]/vecs[1], then add the swapped halves to
  // obtain {S0, S1, S0, S1}.
  const Packet4d pairs01 = _mm256_hadd_pd(vecs[0], vecs[1]);
  const Packet4d total01 = _mm256_add_pd(pairs01, _mm256_permute2f128_pd(pairs01, pairs01, 1));

  // Same for vecs[2]/vecs[3]: {S2, S3, S2, S3}.
  const Packet4d pairs23 = _mm256_hadd_pd(vecs[2], vecs[3]);
  const Packet4d total23 = _mm256_add_pd(pairs23, _mm256_permute2f128_pd(pairs23, pairs23, 1));

  // Lanes 0-1 from the first, lanes 2-3 from the second.
  return _mm256_blend_pd(total01, total23, 0xC);
}
// predux: sum of all lanes. The two 128-bit halves are added together and the
// half-width predux finishes the reduction.

template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
  const __m128 low_half = _mm256_castps256_ps128(a);
  const __m128 high_half = _mm256_extractf128_ps(a, 1);
  return predux(Packet4f(_mm_add_ps(low_half, high_half)));
}

template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
  const __m128d low_half = _mm256_castpd256_pd128(a);
  const __m128d high_half = _mm256_extractf128_pd(a, 1);
  return predux(Packet2d(_mm_add_pd(low_half, high_half)));
}
// predux_downto4: partial reduction from 8 to 4 lanes; lane i of the result
// is a[i] + a[i+4].
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
{
  const __m128 low_half = _mm256_castps256_ps128(a);
  const __m128 high_half = _mm256_extractf128_ps(a, 1);
  return _mm_add_ps(low_half, high_half);
}
// predux_mul: product of all lanes, computed by repeatedly multiplying the
// packet with a permuted copy of itself and reading lane 0 at the end.

template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
  // Combine the two 128-bit halves.
  const Packet8f across_halves = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
  // Combine element pairs {0,1} and {2,3} within each half.
  const Packet8f across_pairs = _mm256_mul_ps(
      across_halves, _mm256_shuffle_ps(across_halves, across_halves, _MM_SHUFFLE(1,0,3,2)));
  // Combine elements 0 and 1.
  return pfirst(_mm256_mul_ps(across_pairs, _mm256_shuffle_ps(across_pairs, across_pairs, 1)));
}

template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
  // Combine the two 128-bit halves.
  const Packet4d across_halves = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
  // Combine elements 0 and 1.
  return pfirst(_mm256_mul_pd(across_halves, _mm256_shuffle_pd(across_halves, across_halves, 1)));
}
// predux_min: minimum over all lanes, computed by repeatedly taking the
// minimum of the packet and a permuted copy of itself and reading lane 0 at
// the end.

template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
  // Combine the two 128-bit halves.
  const Packet8f across_halves = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
  // Combine element pairs {0,1} and {2,3} within each half.
  const Packet8f across_pairs = _mm256_min_ps(
      across_halves, _mm256_shuffle_ps(across_halves, across_halves, _MM_SHUFFLE(1,0,3,2)));
  // Combine elements 0 and 1.
  return pfirst(_mm256_min_ps(across_pairs, _mm256_shuffle_ps(across_pairs, across_pairs, 1)));
}

template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
  // Combine the two 128-bit halves.
  const Packet4d across_halves = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
  // Combine elements 0 and 1.
  return pfirst(_mm256_min_pd(across_halves, _mm256_shuffle_pd(across_halves, across_halves, 1)));
}
// predux_max: maximum over all lanes, computed by repeatedly taking the
// maximum of the packet and a permuted copy of itself and reading lane 0 at
// the end.

template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
  // Combine the two 128-bit halves.
  const Packet8f across_halves = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
  // Combine element pairs {0,1} and {2,3} within each half.
  const Packet8f across_pairs = _mm256_max_ps(
      across_halves, _mm256_shuffle_ps(across_halves, across_halves, _MM_SHUFFLE(1,0,3,2)));
  // Combine elements 0 and 1.
  return pfirst(_mm256_max_ps(across_pairs, _mm256_shuffle_ps(across_pairs, across_pairs, 1)));
}

template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
  // Combine the two 128-bit halves.
  const Packet4d across_halves = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
  // Combine elements 0 and 1.
  return pfirst(_mm256_max_pd(across_halves, _mm256_shuffle_pd(across_halves, across_halves, 1)));
}
// Packet alignment helper for 8-float packets.
//
// For Offset in 1..7, `first` is overwritten so that it holds elements
// Offset..7 of the original `first` followed by elements 0..Offset-1 of
// `second`. Any other Offset leaves `first` untouched.
//
// Every case follows the same recipe:
//   1. blend the low `Offset` elements of `second` into `first`;
//   2. rotate the blended packet by `Offset` positions using an in-lane
//      permute, a swap of the two 128-bit lanes and a final blend (AVX has no
//      single cross-lane rotate for floats).
template<int Offset>
struct palign_impl<Offset,Packet8f>
{
  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
  {
    switch (Offset)
    {
      case 1:
      {
        first = _mm256_blend_ps(first, second, 1);
        Packet8f rotated = _mm256_permute_ps(first, _MM_SHUFFLE(0,3,2,1));
        Packet8f swapped = _mm256_permute2f128_ps(rotated, rotated, 1);
        first = _mm256_blend_ps(rotated, swapped, 0x88);
        break;
      }
      case 2:
      {
        first = _mm256_blend_ps(first, second, 3);
        Packet8f rotated = _mm256_permute_ps(first, _MM_SHUFFLE(1,0,3,2));
        Packet8f swapped = _mm256_permute2f128_ps(rotated, rotated, 1);
        first = _mm256_blend_ps(rotated, swapped, 0xcc);
        break;
      }
      case 3:
      {
        first = _mm256_blend_ps(first, second, 7);
        Packet8f rotated = _mm256_permute_ps(first, _MM_SHUFFLE(2,1,0,3));
        Packet8f swapped = _mm256_permute2f128_ps(rotated, rotated, 1);
        first = _mm256_blend_ps(rotated, swapped, 0xee);
        break;
      }
      case 4:
      {
        // A shift by a whole 128-bit lane: both permutes use the identity pattern.
        first = _mm256_blend_ps(first, second, 15);
        Packet8f rotated = _mm256_permute_ps(first, _MM_SHUFFLE(3,2,1,0));
        Packet8f swapped = _mm256_permute2f128_ps(rotated, rotated, 1);
        first = _mm256_permute_ps(swapped, _MM_SHUFFLE(3,2,1,0));
        break;
      }
      case 5:
      {
        first = _mm256_blend_ps(first, second, 31);
        first = _mm256_permute2f128_ps(first, first, 1);
        Packet8f rotated = _mm256_permute_ps(first, _MM_SHUFFLE(0,3,2,1));
        first = _mm256_permute2f128_ps(rotated, rotated, 1);
        first = _mm256_blend_ps(rotated, first, 0x88);
        break;
      }
      case 6:
      {
        first = _mm256_blend_ps(first, second, 63);
        first = _mm256_permute2f128_ps(first, first, 1);
        Packet8f rotated = _mm256_permute_ps(first, _MM_SHUFFLE(1,0,3,2));
        first = _mm256_permute2f128_ps(rotated, rotated, 1);
        first = _mm256_blend_ps(rotated, first, 0xcc);
        break;
      }
      case 7:
      {
        first = _mm256_blend_ps(first, second, 127);
        first = _mm256_permute2f128_ps(first, first, 1);
        Packet8f rotated = _mm256_permute_ps(first, _MM_SHUFFLE(2,1,0,3));
        first = _mm256_permute2f128_ps(rotated, rotated, 1);
        first = _mm256_blend_ps(rotated, first, 0xee);
        break;
      }
      default:
        break;
    }
  }
};
// Packet alignment helper for 4-double packets.
//
// For Offset in 1..3, the low `Offset` elements of `second` are blended into
// `first` and the packet is then rotated by `Offset` positions, so `first`
// ends up holding its own elements Offset..3 followed by elements
// 0..Offset-1 of `second`. Any other Offset leaves `first` untouched.
template<int Offset>
struct palign_impl<Offset,Packet4d>
{
  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
  {
    switch (Offset)
    {
      case 1:
      {
        first = _mm256_blend_pd(first, second, 1);
        // Swap the two doubles inside each 128-bit lane, then pull elements
        // 1 and 3 from the lane-swapped copy.
        __m256d pair_swapped = _mm256_permute_pd(first, 5);
        first = _mm256_permute2f128_pd(pair_swapped, pair_swapped, 1);
        first = _mm256_blend_pd(pair_swapped, first, 0xA);
        break;
      }
      case 2:
      {
        // A shift by exactly one 128-bit lane: blend, then swap the lanes.
        first = _mm256_blend_pd(first, second, 3);
        first = _mm256_permute2f128_pd(first, first, 1);
        break;
      }
      case 3:
      {
        first = _mm256_blend_pd(first, second, 7);
        // Same as Offset 1, but elements 0 and 2 come from the lane-swapped copy.
        __m256d pair_swapped = _mm256_permute_pd(first, 5);
        first = _mm256_permute2f128_pd(pair_swapped, pair_swapped, 1);
        first = _mm256_blend_pd(pair_swapped, first, 5);
        break;
      }
      default:
        break;
    }
  }
};
// In-place transpose of an 8x8 block of floats held as eight Packet8f rows.
// Three stages: interleave adjacent rows, gather 4-element groups with
// in-lane shuffles, then recombine 128-bit halves across row groups.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
  // Stage 1: interleave each pair of consecutive rows (low / high parts of
  // every 128-bit lane).
  __m256 rows01_lo = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 rows01_hi = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 rows23_lo = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 rows23_hi = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m256 rows45_lo = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m256 rows45_hi = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m256 rows67_lo = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m256 rows67_hi = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);

  // Stage 2: combine the interleaved pairs of rows 0-3 and of rows 4-7 into
  // groups of four.
  __m256 quad0 = _mm256_shuffle_ps(rows01_lo, rows23_lo, _MM_SHUFFLE(1,0,1,0));
  __m256 quad1 = _mm256_shuffle_ps(rows01_lo, rows23_lo, _MM_SHUFFLE(3,2,3,2));
  __m256 quad2 = _mm256_shuffle_ps(rows01_hi, rows23_hi, _MM_SHUFFLE(1,0,1,0));
  __m256 quad3 = _mm256_shuffle_ps(rows01_hi, rows23_hi, _MM_SHUFFLE(3,2,3,2));
  __m256 quad4 = _mm256_shuffle_ps(rows45_lo, rows67_lo, _MM_SHUFFLE(1,0,1,0));
  __m256 quad5 = _mm256_shuffle_ps(rows45_lo, rows67_lo, _MM_SHUFFLE(3,2,3,2));
  __m256 quad6 = _mm256_shuffle_ps(rows45_hi, rows67_hi, _MM_SHUFFLE(1,0,1,0));
  __m256 quad7 = _mm256_shuffle_ps(rows45_hi, rows67_hi, _MM_SHUFFLE(3,2,3,2));

  // Stage 3: 0x20 joins the two low 128-bit halves, 0x31 the two high halves.
  kernel.packet[0] = _mm256_permute2f128_ps(quad0, quad4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(quad1, quad5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(quad2, quad6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_ps(quad3, quad7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_ps(quad0, quad4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_ps(quad1, quad5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_ps(quad2, quad6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_ps(quad3, quad7, 0x31);
}
// In-place transpose-style rearrangement of four Packet8f rows (a 4x8 block):
// rows are interleaved pairwise, gathered into groups of four with in-lane
// shuffles, and the resulting 128-bit halves are then recombined.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
  // Interleave consecutive rows (low / high parts of every 128-bit lane).
  __m256 rows01_lo = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 rows01_hi = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 rows23_lo = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 rows23_hi = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  // Gather four-element groups drawn from all four input rows.
  __m256 quad0 = _mm256_shuffle_ps(rows01_lo, rows23_lo, _MM_SHUFFLE(1,0,1,0));
  __m256 quad1 = _mm256_shuffle_ps(rows01_lo, rows23_lo, _MM_SHUFFLE(3,2,3,2));
  __m256 quad2 = _mm256_shuffle_ps(rows01_hi, rows23_hi, _MM_SHUFFLE(1,0,1,0));
  __m256 quad3 = _mm256_shuffle_ps(rows01_hi, rows23_hi, _MM_SHUFFLE(3,2,3,2));

  // 0x20 joins the low 128-bit halves of both operands, 0x31 the high halves.
  kernel.packet[0] = _mm256_permute2f128_ps(quad0, quad1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(quad2, quad3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(quad0, quad1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_ps(quad2, quad3, 0x31);
}
// In-place transpose of a 4x4 block of doubles held as four Packet4d rows.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
  // Pair up rows: mask 15 picks the odd-indexed element of every 128-bit lane
  // from both rows, mask 0 picks the even-indexed one.
  __m256d odd_of_rows01  = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
  __m256d even_of_rows01 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m256d odd_of_rows23  = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
  __m256d even_of_rows23 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);

  // 0x20 (32) joins the low 128-bit halves, 0x31 (49) the high halves. All
  // inputs are temporaries, so the output rows can be written in any order.
  kernel.packet[0] = _mm256_permute2f128_pd(even_of_rows01, even_of_rows23, 0x20);
  kernel.packet[1] = _mm256_permute2f128_pd(odd_of_rows01,  odd_of_rows23,  0x20);
  kernel.packet[2] = _mm256_permute2f128_pd(even_of_rows01, even_of_rows23, 0x31);
  kernel.packet[3] = _mm256_permute2f128_pd(odd_of_rows01,  odd_of_rows23,  0x31);
}
// Element-wise select: lane i of the result is thenPacket[i] when
// ifPacket.select[i] is non-zero and elsePacket[i] when it is zero.
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
  // Load the selector entries as floats (element 0 is the last argument).
  const __m256 selector_values = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6],
                                               ifPacket.select[5], ifPacket.select[4],
                                               ifPacket.select[3], ifPacket.select[2],
                                               ifPacket.select[1], ifPacket.select[0]);
  // All-ones in every lane whose selector compares equal to zero.
  __m256 take_else = _mm256_cmp_ps(selector_values, _mm256_setzero_ps(), _CMP_EQ_UQ);
  // blendv picks its second operand where the mask's sign bit is set.
  return _mm256_blendv_ps(thenPacket, elsePacket, take_else);
}
// Element-wise select: lane i of the result is thenPacket[i] when
// ifPacket.select[i] is non-zero and elsePacket[i] when it is zero.
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
  // Load the selector entries as doubles (element 0 is the last argument).
  const __m256d selector_values = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2],
                                                ifPacket.select[1], ifPacket.select[0]);
  // All-ones in every lane whose selector compares equal to zero.
  __m256d take_else = _mm256_cmp_pd(selector_values, _mm256_setzero_pd(), _CMP_EQ_UQ);
  // blendv picks its second operand where the mask's sign bit is set.
  return _mm256_blendv_pd(thenPacket, elsePacket, take_else);
}
// Returns a copy of `a` whose element 0 is replaced by `b`.
template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
{
  const Packet8f all_b = pset1<Packet8f>(b);
  // Blend mask bit 0 selects element 0 from the broadcast packet.
  return _mm256_blend_ps(a, all_b, 1);
}
// Returns a copy of `a` whose element 0 is replaced by `b`.
template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
{
  const Packet4d all_b = pset1<Packet4d>(b);
  // Blend mask bit 0 selects element 0 from the broadcast packet.
  return _mm256_blend_pd(a, all_b, 1);
}
// Returns a copy of `a` whose last element (index 7) is replaced by `b`.
template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
{
  const Packet8f all_b = pset1<Packet8f>(b);
  // Blend mask bit 7 selects element 7 from the broadcast packet.
  return _mm256_blend_ps(a, all_b, (1<<7));
}
// Returns a copy of `a` whose last element (index 3) is replaced by `b`.
template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
{
  const Packet4d all_b = pset1<Packet4d>(b);
  // Blend mask bit 3 selects element 3 from the broadcast packet.
  return _mm256_blend_pd(a, all_b, (1<<3));
}
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_PACKET_MATH_AVX_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX/TypeCasting.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_AVX_H
#define EIGEN_TYPE_CASTING_AVX_H
namespace
Eigen
{
namespace
internal
{
// For now we use SSE to handle integers, so we can't use AVX instructions to cast
// from int to float
// Cast traits for float -> int: vectorized casting is switched off
// (VectorizedCast = 0); the coefficient ratios are 1:1.
template <>
struct type_casting_traits<float, int> {
  enum {
    VectorizedCast = 0,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};
// Cast traits for int -> float: vectorized casting is switched off
// (VectorizedCast = 0); the coefficient ratios are 1:1.
template <>
struct type_casting_traits<int, float> {
  enum {
    VectorizedCast = 0,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
};
// Converts eight floats to eight 32-bit integers.
//
// Uses the truncating conversion (_mm256_cvttps_epi32) so that each lane
// matches what a scalar float -> int cast produces (round toward zero).
// The previous _mm256_cvtps_epi32 rounds according to the current MXCSR
// rounding mode (round-to-nearest-even by default), e.g. 1.7f became 2.
template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
  return _mm256_cvttps_epi32(a);
}
// Converts eight 32-bit integers to eight floats.
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
  const Packet8f as_float = _mm256_cvtepi32_ps(a);
  return as_float;
}
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_TYPE_CASTING_AVX_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX512/MathFunctions.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
namespace
Eigen
{
namespace
internal
{
// Disable the code for older versions of gcc that don't support many of the required avx512 intrinsics.
#if EIGEN_GNUC_AT_LEAST(5, 3)
// Helpers that declare a local constant packet named p16f_<NAME> / p8d_<NAME>
// with every lane set to X.

// Float packet, X given as a float value.
#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
  const Packet16f p16f_##NAME = pset1<Packet16f>(X)

// Float packet, X given as the 32-bit integer bit pattern of each lane.
#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
  const Packet16f p16f_##NAME = _mm512_castsi512_ps(pset1<Packet16i>(X))

// Double packet, X given as a double value.
#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
  const Packet8d p8d_##NAME = pset1<Packet8d>(X)

// Double packet, X given as the 64-bit integer bit pattern of each lane.
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
  const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
// Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
#if defined(EIGEN_VECTORIZE_AVX512DQ)
// Natural logarithm of sixteen floats.
//
// The input is split into exponent e and mantissa m, the mantissa is moved
// into [sqrt(1/2), sqrt(2)) and shifted by -1, a degree-8 polynomial is
// evaluated on it, and e*log(2) is added back (log(2) is applied as the two
// constants q1 + q2). Special inputs are patched at the end:
//   - zero             -> -INF
//   - negative or NaN  -> NaN
//   - +INF             -> +INF
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
plog<Packet16f>(const Packet16f& _x) {
  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);

  // Clears the exponent bits, keeping sign and mantissa.
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);

  // The smallest non denormalized float number.
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);

  // Polynomial coefficients.
  _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);

  // Lanes where !(x >= 0) holds, i.e. the input is negative or NaN.
  __mmask16 invalid_mask =
      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_NGE_UQ);
  // Lanes where the input is exactly zero.
  __mmask16 iszero_mask =
      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_EQ_OQ);

  // Raise inputs to at least the minimum positive normal.
  Packet16f mant = pmax(_x, p16f_min_norm_pos);

  // Extract the shifted (biased) exponents and remove the bias.
  Packet16f biased_exp =
      _mm512_cvtepi32_ps(_mm512_srli_epi32(_mm512_castps_si512(mant), 23));
  Packet16f e = _mm512_sub_ps(biased_exp, p16f_126f);

  // Set the exponents to -1, i.e. the mantissas are in the range [0.5,1).
  mant = _mm512_and_ps(mant, p16f_inv_mant_mask);
  mant = _mm512_or_ps(mant, p16f_half);

  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) and shift
  // by -1. The values are then centered around 0, which improves the
  // stability of the polynomial evaluation:
  //   if( m < SQRTHF ) { e -= 1; m = m + m - 1.0; }
  //   else             { m = m - 1.0; }
  __mmask16 below_sqrthf =
      _mm512_cmp_ps_mask(mant, p16f_cephes_SQRTHF, _CMP_LT_OQ);
  Packet16f extra = _mm512_mask_blend_ps(below_sqrthf, _mm512_setzero_ps(), mant);
  mant = psub(mant, p16f_1);
  e = psub(e, _mm512_mask_blend_ps(below_sqrthf, _mm512_setzero_ps(), p16f_1));
  mant = padd(mant, extra);

  Packet16f mant2 = pmul(mant, mant);
  Packet16f mant3 = pmul(mant2, mant);

  // Evaluate the degree-8 polynomial as three independent sub-polynomials
  // that are merged afterwards (presumably for instruction-level parallelism).
  Packet16f poly = pmadd(p16f_cephes_log_p0, mant, p16f_cephes_log_p1);
  Packet16f poly_mid = pmadd(p16f_cephes_log_p3, mant, p16f_cephes_log_p4);
  Packet16f poly_low = pmadd(p16f_cephes_log_p6, mant, p16f_cephes_log_p7);
  poly = pmadd(poly, mant, p16f_cephes_log_p2);
  poly_mid = pmadd(poly_mid, mant, p16f_cephes_log_p5);
  poly_low = pmadd(poly_low, mant, p16f_cephes_log_p8);
  poly = pmadd(poly, mant3, poly_mid);
  poly = pmadd(poly, mant3, poly_low);
  poly = pmul(poly, mant3);

  // Add the logarithm of the exponent back to the result of the interpolation.
  Packet16f e_times_q1 = pmul(e, p16f_cephes_log_q1);
  Packet16f half_mant2 = pmul(mant2, p16f_half);
  poly = padd(poly, e_times_q1);
  mant = psub(mant, half_mant2);
  Packet16f e_times_q2 = pmul(e, p16f_cephes_log_q2);
  mant = padd(mant, poly);
  mant = padd(mant, e_times_q2);

  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x, p16f_pos_inf, _CMP_EQ_OQ);

  // Patch the special cases, innermost first: +INF stays +INF, invalid
  // inputs become NaN, and zero becomes -INF.
  Packet16f with_inf = _mm512_mask_blend_ps(pos_inf_mask, mant, p16f_pos_inf);
  Packet16f with_nan = _mm512_mask_blend_ps(invalid_mask, with_inf, p16f_nan);
  return _mm512_mask_blend_ps(iszero_mask, with_nan, p16f_minus_inf);
}
#endif
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where r lies in the range [-log(2)/2, log(2)/2).
// Exponential of sixteen floats.
//
// The (clamped) input is written as x = m*ln(2) + r with
// m = floor(x/ln(2) + 0.5); exp(r) is approximated by a polynomial and then
// scaled by 2^m, which is built directly in the float exponent field.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
pexp<Packet16f>(const Packet16f& _x) {
  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);

  _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
  _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);

  _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);

  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);

  // Clamp the input to [exp_lo, exp_hi].
  Packet16f clamped = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);

  // Express exp(x) as exp(m*ln(2) + r), start by extracting
  // m = floor(x/ln(2) + 0.5).
  Packet16f m = _mm512_floor_ps(pmadd(clamped, p16f_cephes_LOG2EF, p16f_half));

  // Get r = x - m*ln(2) with a fused multiply-add, so the product is not
  // rounded before the addition.
  _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
  Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, clamped);
  Packet16f r_squared = pmul(r, r);

  // Horner evaluation of the polynomial in r.
  // TODO(gonnet): Split into odd/even polynomials and try to exploit
  //               instruction-level parallelism.
  Packet16f poly = p16f_cephes_exp_p0;
  poly = pmadd(poly, r, p16f_cephes_exp_p1);
  poly = pmadd(poly, r, p16f_cephes_exp_p2);
  poly = pmadd(poly, r, p16f_cephes_exp_p3);
  poly = pmadd(poly, r, p16f_cephes_exp_p4);
  poly = pmadd(poly, r, p16f_cephes_exp_p5);
  poly = pmadd(poly, r_squared, r);
  poly = padd(poly, p16f_1);

  // Build 2^m: add the exponent bias (127) and shift the integer into the
  // exponent field of a float.
  Packet16i pow2_bits = _mm512_cvttps_epi32(padd(m, p16f_127));
  pow2_bits = _mm512_slli_epi32(pow2_bits, 23);

  // Return 2^m * exp(r), taking the maximum with the original input.
  return pmax(pmul(poly, _mm512_castsi512_ps(pow2_bits)), _x);
}
/*template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
pexp<Packet8d>(const Packet8d& _x) {
Packet8d x = _x;
_EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
_EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
_EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
// clamp x
x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
// Express exp(x) as exp(g + n*log(2)).
const Packet8d n =
_mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
// digits right.
const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
x = psub(x, nC1);
x = psub(x, nC2);
const Packet8d x2 = pmul(x, x);
// Evaluate the numerator polynomial of the rational interpolant.
Packet8d px = p8d_cephes_exp_p0;
px = pmadd(px, x2, p8d_cephes_exp_p1);
px = pmadd(px, x2, p8d_cephes_exp_p2);
px = pmul(px, x);
// Evaluate the denominator polynomial of the rational interpolant.
Packet8d qx = p8d_cephes_exp_q0;
qx = pmadd(qx, x2, p8d_cephes_exp_q1);
qx = pmadd(qx, x2, p8d_cephes_exp_q2);
qx = pmadd(qx, x2, p8d_cephes_exp_q3);
// I don't really get this bit, copied from the SSE2 routines, so...
// TODO(gonnet): Figure out what is going on here, perhaps find a better
// rational interpolant?
x = _mm512_div_pd(px, psub(qx, px));
x = pmadd(p8d_2, x, p8d_1);
// Build e=2^n.
const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
_mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
return pmax(pmul(x, e), _x);
}*/
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm512_rsqrt14_ps/_pd approximation and
// one step of Newton's method (two steps for doubles), at a cost of 1-2 bits of
// precision as opposed to the
// exact solution. The main advantage of this approach is not just speed, but
// also the fact that it can be inlined and pipelined with other computations,
// further reducing its effective latency.
#if EIGEN_FAST_MATH
// Fast approximate square root of sixteen floats: sqrt(x) = x * rsqrt(x),
// with the 14-bit reciprocal square root estimate refined by one Newton step.
// Inputs in [0, FLT_MIN) (zero and positive denormals) yield 0.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
  // -x/2, reused by the Newton update below.
  Packet16f minus_half_x = pmul(_x, pset1<Packet16f>(-.5f));

  // Lanes with 0 <= x < smallest positive normal float.
  __mmask16 below_min_normal = _mm512_cmp_ps_mask(
      _x, pset1<Packet16f>((std::numeric_limits<float>::min)()), _CMP_LT_OQ);
  __mmask16 non_negative =
      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ);
  __mmask16 denormal_mask = _mm512_kand(below_min_normal, non_negative);

  Packet16f rsqrt_est = _mm512_rsqrt14_ps(_x);

  // Do a single step of Newton's iteration: y <- y * (1.5 - (x/2) * y^2).
  rsqrt_est = pmul(rsqrt_est, pmadd(minus_half_x, pmul(rsqrt_est, rsqrt_est),
                                    pset1<Packet16f>(1.5f)));

  // Flush results for denormals to zero.
  return _mm512_mask_blend_ps(denormal_mask, pmul(_x, rsqrt_est),
                              _mm512_setzero_ps());
}
// Fast approximate square root of eight doubles: sqrt(x) = x * rsqrt(x),
// with the 14-bit reciprocal square root estimate refined by two Newton
// steps. Inputs in [0, DBL_MIN) (zero and positive denormals) yield 0.
//
// The lane mask is kept in an 8-bit __mmask8, which is what both
// _mm512_cmp_pd_mask produces and _mm512_mask_blend_pd consumes; it was
// previously stored in a __mmask16 and implicitly narrowed at the blend.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
  // -x/2, reused by both Newton updates below.
  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));

  // Lanes with 0 <= x < smallest positive normal double. _mm512_kand works on
  // 16-bit masks; only the low 8 bits can be set here, so the narrowing cast
  // is lossless.
  __mmask8 denormal_mask = static_cast<__mmask8>(
      _mm512_kand(_mm512_cmp_pd_mask(
                      _x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
                      _CMP_LT_OQ),
                  _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)));

  Packet8d x = _mm512_rsqrt14_pd(_x);

  // Do a single step of Newton's iteration: y <- y * (1.5 - (x/2) * y^2).
  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));

  // Do a second step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));

  // Flush results for denormals to zero.
  return _mm512_mask_blend_pd(denormal_mask, pmul(_x, x), _mm512_setzero_pd());
}
#else
// Exact square root of sixteen floats (used when EIGEN_FAST_MATH is off).
template <>
EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
  const Packet16f root = _mm512_sqrt_ps(x);
  return root;
}
// Exact square root of eight doubles (used when EIGEN_FAST_MATH is off).
template <>
EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
  const Packet8d root = _mm512_sqrt_pd(x);
  return root;
}
#endif
// Functions for rsqrt.
// Almost identical to the sqrt routine, just leave out the last multiplication
// and fill in NaN/Inf where needed. Note that this function only exists as an
// iterative version for doubles since there is no instruction for directly
// computing the reciprocal square root in AVX-512.
#ifdef EIGEN_FAST_MATH
// Approximate reciprocal square root of sixteen floats: the 14-bit hardware
// estimate refined by one Newton step. Lanes below the smallest positive
// normal float are replaced at the end: negative inputs give NaN, the
// remaining ones (zero and positive denormals) give +INF.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
prsqrt<Packet16f>(const Packet16f& _x) {
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);

  // -x/2, reused by the Newton update below.
  Packet16f minus_half_x = pmul(_x, p16f_minus_half);

  // Select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well); all other lanes start from 0.
  __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
  Packet16f estimate = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x),
                                            _mm512_setzero_ps());

  // Prepare the replacement values: NaN for negative lanes, +INF for the
  // other lanes flagged by le_zero_mask, 0 elsewhere.
  __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
  Packet16f inf_where_flagged =
      _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf);
  Packet16f infs_and_nans =
      _mm512_mask_blend_ps(neg_mask, inf_where_flagged, p16f_nan);

  // Do a single step of Newton's iteration: y <- y * (1.5 - (x/2) * y^2).
  estimate = pmul(estimate, pmadd(minus_half_x, pmul(estimate, estimate),
                                  p16f_one_point_five));

  // Insert NaNs and Infs in all the right places.
  return _mm512_mask_blend_ps(le_zero_mask, estimate, infs_and_nans);
}
// Approximate reciprocal square root of eight doubles: the 14-bit hardware
// estimate refined by two Newton steps. Lanes below the smallest positive
// normal double are replaced at the end: negative inputs give NaN, the
// remaining ones (zero and positive denormals) give +INF.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
prsqrt<Packet8d>(const Packet8d& _x) {
  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
  // Quiet NaN (mantissa bit 51 set), matching the quiet 0x7fc00000 used by the
  // float version. The former pattern 0x7ff1000000000000 leaves the quiet bit
  // clear and is therefore a signalling NaN.
  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff8000000000000LL);
  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);

  // -x/2, reused by both Newton updates below.
  Packet8d neg_half = pmul(_x, p8d_minus_half);

  // Select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well); all other lanes start from 0.
  __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x),
                                    _mm512_setzero_pd());

  // Fill in NaNs and Infs for the negative/zero entries.
  __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
  Packet8d infs_and_nans = _mm512_mask_blend_pd(
      neg_mask,
      _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf),
      p8d_nan);

  // Do a first step of Newton's iteration: y <- y * (1.5 - (x/2) * y^2).
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));

  // Do a second step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));

  // Insert NaNs and Infs in all the right places.
  return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
}
#elif defined(EIGEN_VECTORIZE_AVX512ER)
// Reciprocal square root of sixteen floats using the AVX512ER 28-bit
// approximation instruction.
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
  const Packet16f inv_root = _mm512_rsqrt28_ps(x);
  return inv_root;
}
#endif
#endif
}
// end namespace internal
}
// end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
pydensecrf/densecrf/include/Eigen/src/Core/arch/AVX512/PacketMath.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_AVX512_H
#define EIGEN_PACKET_MATH_AVX512_H
namespace
Eigen
{
namespace
internal
{
// Default AVX512 settings. Each value is only defined here when it has not
// already been defined before this header is included.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
// When FMA vectorization is enabled, flag that multiply-add is available as a
// single instruction (unless the flag is already set).
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
// 512-bit packet types: 16 floats, 16 32-bit integers and 8 doubles.
typedef __m512  Packet16f;
typedef __m512i Packet16i;
typedef __m512d Packet8d;
// Marks the raw 512-bit float vector type as an arithmetic type.
template <>
struct is_arithmetic<__m512> {
  enum { value = true };
};
// Marks the raw 512-bit integer vector type as an arithmetic type.
template <>
struct is_arithmetic<__m512i> {
  enum { value = true };
};
// Marks the raw 512-bit double vector type as an arithmetic type.
template <>
struct is_arithmetic<__m512d> {
  enum { value = true };
};
// Packet traits for float under AVX512: the full packet holds 16 floats and
// the half packet is the 8-float AVX packet.
//
// The math capabilities (log/exp/sqrt/rsqrt) are only advertised for GCC >= 5.3
// or for compilers that are not strict GCC.
// NOTE(review): the AVX512 math function definitions are guarded by
// EIGEN_GNUC_AT_LEAST(5, 3) alone -- confirm that both guards agree for
// non-GCC compilers.
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet16f type;
  typedef Packet8f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
    HasHalfPacket = 1,
    HasBlend = 0,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ
    // plog<Packet16f> is only compiled when AVX512DQ is available.
    HasLog = 1,
#endif
    HasExp = 1,
    // The vectorized sqrt/rsqrt are advertised only in fast-math mode.
    HasSqrt = EIGEN_FAST_MATH,
    HasRsqrt = EIGEN_FAST_MATH,
#endif
    HasDiv = 1
  };
};
// Packet traits for double under AVX512: the full packet holds 8 doubles and
// the half packet is the 4-double AVX packet.
template <>
struct packet_traits<double> : default_packet_traits {
  typedef Packet8d type;
  typedef Packet4d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
    // The vectorized sqrt/rsqrt are advertised only in fast-math mode.
    HasSqrt = EIGEN_FAST_MATH,
    HasRsqrt = EIGEN_FAST_MATH,
#endif
    HasDiv = 1
  };
};
/* TODO Implement AVX512 for integers
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet16i type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
};
};
*/
// Reverse traits for Packet16f: scalar type, half packet, matching integer
// packet, element count and required alignment (64 bytes).
template <>
struct unpacket_traits<Packet16f> {
  typedef float type;
  typedef Packet8f half;
  typedef Packet16i integer_packet;
  enum { size = 16, alignment = Aligned64 };
};
// Reverse traits for Packet8d: scalar type, half packet, element count and
// required alignment (64 bytes).
template <>
struct unpacket_traits<Packet8d> {
  typedef double type;
  typedef Packet4d half;
  enum { size = 8, alignment = Aligned64 };
};
// Reverse traits for Packet16i: scalar type, half packet, element count and
// required alignment (64 bytes).
template <>
struct unpacket_traits<Packet16i> {
  typedef int type;
  typedef Packet8i half;
  enum { size = 16, alignment = Aligned64 };
};
// Broadcasts one float to all 16 lanes.
template <>
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
  const Packet16f broadcast = _mm512_set1_ps(from);
  return broadcast;
}
// Broadcasts one double to all 8 lanes.
template <>
EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
  const Packet8d broadcast = _mm512_set1_pd(from);
  return broadcast;
}
// Broadcasts one 32-bit integer to all 16 lanes.
template <>
EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
  const Packet16i broadcast = _mm512_set1_epi32(from);
  return broadcast;
}
// Loads the float at `from` and replicates it into all 16 lanes.
template <>
EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
  // Load the scalar into a 128-bit register first, then widen the broadcast
  // to 512 bits.
  const __m128 scalar_x4 = _mm_load_ps1(from);
  return _mm512_broadcastss_ps(scalar_x4);
}
// Loads the double at `from` and replicates it into all 8 lanes.
template <>
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
  const Packet8d broadcast = _mm512_set1_pd(*from);
  return broadcast;
}
// Returns the linear sequence {a, a+1, ..., a+15}.
template <>
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
  // _mm512_set_ps lists the highest lane first, so lane i receives i.
  const Packet16f lane_index =
      _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
                    7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
  return _mm512_add_ps(_mm512_set1_ps(a), lane_index);
}
// Returns the linear sequence {a, a+1, ..., a+7}.
template <>
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
  // _mm512_set_pd lists the highest lane first, so lane i receives i.
  const Packet8d lane_index =
      _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
  return _mm512_add_pd(_mm512_set1_pd(a), lane_index);
}
// Lane-wise sum of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  const Packet16f sum = _mm512_add_ps(a, b);
  return sum;
}
// Lane-wise sum of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  const Packet8d sum = _mm512_add_pd(a, b);
  return sum;
}
// Lane-wise sum of two packets of sixteen 32-bit integers.
template <>
EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
                                              const Packet16i& b) {
  const Packet16i sum = _mm512_add_epi32(a, b);
  return sum;
}
// Lane-wise difference a - b of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  const Packet16f difference = _mm512_sub_ps(a, b);
  return difference;
}
// Lane-wise difference a - b of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  const Packet8d difference = _mm512_sub_pd(a, b);
  return difference;
}
// Lane-wise difference a - b of two packets of sixteen 32-bit integers.
template <>
EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
                                              const Packet16i& b) {
  const Packet16i difference = _mm512_sub_epi32(a, b);
  return difference;
}
// Lane-wise negation, computed as 0 - a.
// NOTE(review): because this is a subtraction from +0 rather than a sign-bit
// flip, a +0.0 input yields +0.0 (not -0.0) -- confirm that is acceptable.
template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
  const Packet16f zeros = _mm512_set1_ps(0.0);
  return _mm512_sub_ps(zeros, a);
}
// Lane-wise negation, computed as 0 - a.
// NOTE(review): because this is a subtraction from +0 rather than a sign-bit
// flip, a +0.0 input yields +0.0 (not -0.0) -- confirm that is acceptable.
template <>
EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
  const Packet8d zeros = _mm512_set1_pd(0.0);
  return _mm512_sub_pd(zeros, a);
}
// Complex conjugate of a real float packet: the input is returned unchanged.
template <>
EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
  return a;
}
// Complex conjugate of a real double packet: the input is returned unchanged.
template <>
EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
  return a;
}
// Complex conjugate of an integer packet: the input is returned unchanged.
template <>
EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
  return a;
}
// Lane-wise product of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  const Packet16f product = _mm512_mul_ps(a, b);
  return product;
}
// Lane-wise product of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  const Packet8d product = _mm512_mul_pd(a, b);
  return product;
}
// Lane-wise product of two packets of sixteen 32-bit integers; each lane keeps
// the low 32 bits of its product.
//
// _mm512_mullo_epi32 is the element-wise 32-bit multiply. The previously used
// _mm512_mul_epi32 multiplies only the low 32-bit integer of every 64-bit
// element and stores eight 64-bit products, so the odd-indexed input lanes
// were ignored and the result did not have a 16 x 32-bit layout.
template <>
EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
                                              const Packet16i& b) {
  return _mm512_mullo_epi32(a, b);
}
// Lane-wise quotient a / b of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  const Packet16f quotient = _mm512_div_ps(a, b);
  return quotient;
}
// Lane-wise quotient a / b of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  const Packet8d quotient = _mm512_div_pd(a, b);
  return quotient;
}
#ifdef EIGEN_VECTORIZE_FMA
// Fused multiply-add on 16 floats: returns a * b + c lane by lane.
template <>
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
                                    const Packet16f& c) {
  const Packet16f fused = _mm512_fmadd_ps(a, b, c);
  return fused;
}
// Fused multiply-add on 8 doubles: returns a * b + c lane by lane.
template <>
EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
                                   const Packet8d& c) {
  const Packet8d fused = _mm512_fmadd_pd(a, b, c);
  return fused;
}
#endif
// Lane-wise minimum of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  // The operands are deliberately passed as (b, a): the swapped order makes
  // NaN propagation match std::min.
  const Packet16f smaller = _mm512_min_ps(b, a);
  return smaller;
}
// Lane-wise minimum of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  // The operands are deliberately passed as (b, a): the swapped order makes
  // NaN propagation match std::min.
  const Packet8d smaller = _mm512_min_pd(b, a);
  return smaller;
}
// Lane-wise maximum of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  // The operands are deliberately passed as (b, a): the swapped order makes
  // NaN propagation match std::max.
  const Packet16f larger = _mm512_max_ps(b, a);
  return larger;
}
// Lane-wise maximum of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  // The operands are deliberately passed as (b, a): the swapped order makes
  // NaN propagation match std::max.
  const Packet8d larger = _mm512_max_pd(b, a);
  return larger;
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
// Returns the I_-th 256-bit half (0 = lower, 1 = upper) of a 16-float packet.
// AVX512DQ variant: the dedicated float extraction instruction is available.
template <int I_>
EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
  const Packet8f half = _mm512_extractf32x8_ps(x, I_);
  return half;
}
// Returns the I_-th 128-bit quarter of an 8-double packet.
// AVX512DQ variant: the dedicated double extraction instruction is available.
template <int I_>
EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
  const Packet2d quarter = _mm512_extractf64x2_pd(x, I_);
  return quarter;
}
// Concatenates two 8-float packets into one 16-float packet:
// `a` fills the lower 256 bits and `b` the upper 256 bits (AVX512DQ variant).
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
  const Packet16f widened = _mm512_castps256_ps512(a);
  return _mm512_insertf32x8(widened, b, 1);
}
#else
// Returns the I_-th 256-bit half (0 = lower, 1 = upper) of a 16-float packet.
// Plain AVX512F does not provide _mm512_extractf32x8_ps, so the half is pulled
// out through the integer domain and reinterpreted as floats.
template <int I_>
EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
  const __m512i bits = _mm512_castps_si512(x);
  const __m256i half_bits = _mm512_extracti64x4_epi64(bits, I_);
  return _mm256_castsi256_ps(half_bits);
}
// Returns the I_-th 128-bit quarter of an 8-double packet.
// Plain AVX512F does not provide _mm512_extractf64x2_pd, so the quarter is
// pulled out through the integer domain and reinterpreted as doubles.
template <int I_>
EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
  const __m512i bits = _mm512_castpd_si512(x);
  const __m128i quarter_bits = _mm512_extracti32x4_epi32(bits, I_);
  return _mm_castsi128_pd(quarter_bits);
}
// Concatenates two 8-float packets into one 16-float packet:
// `a` fills the lower 256 bits and `b` the upper 256 bits.
// AVX512F-only variant: the insertion is done on the integer view of the data.
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
  const __m512i lower_bits = _mm512_castsi256_si512(_mm256_castps_si256(a));
  const __m256i upper_bits = _mm256_castps_si256(b);
  return _mm512_castsi512_ps(_mm512_inserti64x4(lower_bits, upper_bits, 1));
}
#endif
// Helper for the bit-packing step of low precision comparisons.
// Narrows the flags from 32x16 (sixteen 32-bit values) to 16x16 (sixteen
// 16-bit values) using signed saturation.
EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
  // The input is cut into 128-bit pieces that are packed pairwise, which keeps
  // the elements in their original order inside the result.
  // Operation:
  //   dst[15:0]    := Saturate16(rf[31:0])
  //   dst[31:16]   := Saturate16(rf[63:32])
  //   ...
  //   dst[255:240] := Saturate16(rf[511:480])
  const __m256i lower = _mm256_castps_si256(extract256<0>(rf));
  const __m256i upper = _mm256_castps_si256(extract256<1>(rf));

  const __m128i packed_lower =
      _mm_packs_epi32(_mm256_extractf128_si256(lower, 0), _mm256_extractf128_si256(lower, 1));
  const __m128i packed_upper =
      _mm_packs_epi32(_mm256_extractf128_si256(upper, 0), _mm256_extractf128_si256(upper, 1));

  return _mm256_insertf128_si256(_mm256_castsi128_si256(packed_lower), packed_upper, 1);
}
// Bitwise AND of two 16-int packets.
template <>
EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, const Packet16i& b) {
  const Packet16i masked = _mm512_and_si512(a, b);
  return masked;
}
// Bitwise AND of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_and_ps(a, b);
#else
  // No 512-bit float AND without AVX512DQ: operate on the integer view of the
  // bits and reinterpret the result as floats.
  const __m512i a_bits = _mm512_castps_si512(a);
  const __m512i b_bits = _mm512_castps_si512(b);
  return _mm512_castsi512_ps(pand(a_bits, b_bits));
#endif
}
// Bitwise AND of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_and_pd(a, b);
#else
  // No 512-bit double AND without AVX512DQ: operate on the integer view of the
  // bits with a single 512-bit AND and reinterpret the result as doubles. This
  // is the same approach the por/pxor/pandnot fallbacks use and avoids
  // splitting the packet into two 256-bit halves.
  return _mm512_castsi512_pd(pand(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
#endif
}
// Bitwise OR of two 16-int packets.
template <>
EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
  const Packet16i merged = _mm512_or_si512(a, b);
  return merged;
}
// Bitwise OR of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_ps(a, b);
#else
  // No 512-bit float OR without AVX512DQ: operate on the integer view of the
  // bits and reinterpret the result as floats.
  const __m512i a_bits = _mm512_castps_si512(a);
  const __m512i b_bits = _mm512_castps_si512(b);
  return _mm512_castsi512_ps(por(a_bits, b_bits));
#endif
}
// Bitwise OR of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_pd(a, b);
#else
  // No 512-bit double OR without AVX512DQ: operate on the integer view of the
  // bits and reinterpret the result as doubles.
  const __m512i a_bits = _mm512_castpd_si512(a);
  const __m512i b_bits = _mm512_castpd_si512(b);
  return _mm512_castsi512_pd(por(a_bits, b_bits));
#endif
}
// Bitwise XOR of two 16-int packets.
template <>
EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
  const Packet16i toggled = _mm512_xor_si512(a, b);
  return toggled;
}
// Bitwise XOR of two 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_ps(a, b);
#else
  // No 512-bit float XOR without AVX512DQ: operate on the integer view of the
  // bits and reinterpret the result as floats.
  const __m512i a_bits = _mm512_castps_si512(a);
  const __m512i b_bits = _mm512_castps_si512(b);
  return _mm512_castsi512_ps(pxor(a_bits, b_bits));
#endif
}
// Bitwise XOR of two 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_pd(a, b);
#else
  // No 512-bit double XOR without AVX512DQ: operate on the integer view of the
  // bits and reinterpret the result as doubles.
  const __m512i a_bits = _mm512_castpd_si512(a);
  const __m512i b_bits = _mm512_castpd_si512(b);
  return _mm512_castsi512_pd(pxor(a_bits, b_bits));
#endif
}
// Computes a & ~b on 16-int packets.
template <>
EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
  // The intrinsic negates its FIRST operand, hence the swapped argument order.
  const Packet16i cleared = _mm512_andnot_si512(b, a);
  return cleared;
}
// Computes a & ~b on the bit patterns of 16-float packets.
template <>
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  // The intrinsic negates its FIRST operand, hence the swapped argument order.
  return _mm512_andnot_ps(b, a);
#else
  // No 512-bit float ANDNOT without AVX512DQ: defer to the integer overload on
  // the integer view of the bits and reinterpret the result as floats.
  const __m512i a_bits = _mm512_castps_si512(a);
  const __m512i b_bits = _mm512_castps_si512(b);
  return _mm512_castsi512_ps(pandnot(a_bits, b_bits));
#endif
}
// Computes a & ~b on the bit patterns of 8-double packets.
template <>
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  // The intrinsic negates its FIRST operand, hence the swapped argument order.
  return _mm512_andnot_pd(b, a);
#else
  // No 512-bit double ANDNOT without AVX512DQ: defer to the integer overload on
  // the integer view of the bits and reinterpret the result as doubles.
  const __m512i a_bits = _mm512_castpd_si512(a);
  const __m512i b_bits = _mm512_castpd_si512(b);
  return _mm512_castsi512_pd(pandnot(a_bits, b_bits));
#endif
}
// Shifts every 32-bit lane right by the compile-time count N, replicating the
// sign bit (arithmetic shift).
template <int N>
EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
  const Packet16i shifted = _mm512_srai_epi32(a, N);
  return shifted;
}
// Shifts every 32-bit lane right by the compile-time count N, filling with
// zeros (logical shift).
template <int N>
EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
  const Packet16i shifted = _mm512_srli_epi32(a, N);
  return shifted;
}
// Shifts every 32-bit lane left by the compile-time count N, filling with
// zeros.
template <int N>
EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
  const Packet16i shifted = _mm512_slli_epi32(a, N);
  return shifted;
}
// Aligned load of 16 floats starting at `from` (uses the aligned-load
// intrinsic, so `from` must satisfy its alignment requirement).
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  return _mm512_load_ps(from);
}
// Aligned load of 8 doubles starting at `from` (uses the aligned-load
// intrinsic, so `from` must satisfy its alignment requirement).
template <>
EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  return _mm512_load_pd(from);
}
// Aligned load of 16 ints starting at `from` (uses the aligned-load
// intrinsic, so `from` must satisfy its alignment requirement).
template <>
EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  const __m512i* source = reinterpret_cast<const __m512i*>(from);
  return _mm512_load_si512(source);
}
// Unaligned load of 16 floats starting at `from`.
template <>
EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm512_loadu_ps(from);
}
// Unaligned load of 8 doubles starting at `from`.
template <>
EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm512_loadu_pd(from);
}
// Unaligned load of 16 ints starting at `from`.
template <>
EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  const __m512i* source = reinterpret_cast<const __m512i*>(from);
  return _mm512_loadu_si512(source);
}
// Loads 8 floats from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
  // An unaligned load is used because nothing is required of the alignment of
  // the input pointer 'from'.
  const __m256i eight_values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
  // Zero-extending each 32-bit value to 64 bits places a_i in every even
  // 32-bit slot and zero in every odd slot.
  const __m512 spread = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(eight_values));
  // Within each 128-bit lane, copy slot 0 over slot 1 and slot 2 over slot 3.
  return _mm512_permute_ps(spread, _MM_SHUFFLE(2, 2, 0, 0));
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
// FIXME: this does not look optimal, better load a Packet4d and shuffle...
// Loads 4 doubles from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
  // Each 128-bit slot of the result receives one input value duplicated twice.
  __m512d result = _mm512_setzero_pd();
  result = _mm512_insertf64x2(result, _mm_loaddup_pd(&from[0]), 0);
  result = _mm512_insertf64x2(result, _mm_loaddup_pd(&from[1]), 1);
  result = _mm512_insertf64x2(result, _mm_loaddup_pd(&from[2]), 2);
  result = _mm512_insertf64x2(result, _mm_loaddup_pd(&from[3]), 3);
  return result;
}
#else
// Loads 4 doubles from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3}
// AVX512F-only variant: every input value is broadcast under a two-lane write
// mask that selects the pair of lanes it has to occupy.
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
  __m512d result = _mm512_setzero_pd();
  result = _mm512_mask_broadcastsd_pd(result, 0x3 << 0, _mm_load_sd(from + 0));
  result = _mm512_mask_broadcastsd_pd(result, 0x3 << 2, _mm_load_sd(from + 1));
  result = _mm512_mask_broadcastsd_pd(result, 0x3 << 4, _mm_load_sd(from + 2));
  result = _mm512_mask_broadcastsd_pd(result, 0x3 << 6, _mm_load_sd(from + 3));
  return result;
}
#endif
// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
  // Output lane i takes input element i / 4 (arguments are listed from the
  // highest lane down to lane 0).
  const Packet16i source_index = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
  // Only the lowest 128 bits of the widened register carry data; the permute
  // reads nothing else.
  const Packet16f four_values = _mm512_castps128_ps512(ploadu<Packet4f>(from));
  return _mm512_permutexvar_ps(source_index, four_values);
}
// Loads 2 doubles from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
  const __m256d first_x4 = _mm256_set1_pd(*from);
  const __m256d second_x4 = _mm256_set1_pd(*(from + 1));
  // Both 256-bit halves are overwritten below, so the starting contents of the
  // register are irrelevant.
  __m512d result = _mm512_undefined_pd();
  result = _mm512_insertf64x4(result, first_x4, 0);
  result = _mm512_insertf64x4(result, second_x4, 1);
  return result;
}
// Aligned store of a 16-float packet to `to` (uses the aligned-store
// intrinsic, so `to` must satisfy its alignment requirement).
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
  EIGEN_DEBUG_ALIGNED_STORE
  _mm512_store_ps(to, from);
}
// Aligned store of an 8-double packet to `to` (uses the aligned-store
// intrinsic, so `to` must satisfy its alignment requirement).
template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
  EIGEN_DEBUG_ALIGNED_STORE
  _mm512_store_pd(to, from);
}
// Store of a 16-int packet to `to` through the aligned-store entry point.
// NOTE(review): unlike the float/double overloads this one issues the
// unaligned store intrinsic; behavior is kept as is - confirm whether that is
// intentional.
template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
  EIGEN_DEBUG_ALIGNED_STORE
  __m512i* destination = reinterpret_cast<__m512i*>(to);
  _mm512_storeu_si512(destination, from);
}
// Unaligned store of a 16-float packet to `to`.
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  _mm512_storeu_ps(to, from);
}
// Unaligned store of an 8-double packet to `to`.
template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  _mm512_storeu_pd(to, from);
}
// Unaligned store of a 16-int packet to `to`.
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  __m512i* destination = reinterpret_cast<__m512i*>(to);
  _mm512_storeu_si512(destination, from);
}
// Strided gather of 16 floats: lane i is read from from[i * stride].
// The stride is converted to a 32-bit int before the offsets are computed.
template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
  const Packet16i step = _mm512_set1_epi32(convert_index<int>(stride));
  // Lane numbers 0..15 (arguments are listed from the highest lane down).
  const Packet16i lane_number = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  const Packet16i offsets = _mm512_mullo_epi32(step, lane_number);
  // Scale 4 = sizeof(float): the offsets are expressed in elements.
  return _mm512_i32gather_ps(offsets, from, 4);
}
// Strided gather of 8 doubles: lane i is read from from[i * stride].
// The stride is converted to a 32-bit int before the offsets are computed.
template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, Index stride) {
  const Packet8i step = _mm256_set1_epi32(convert_index<int>(stride));
  // Lane numbers 0..7 (arguments are listed from the highest lane down).
  const Packet8i lane_number = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  const Packet8i offsets = _mm256_mullo_epi32(step, lane_number);
  // Scale 8 = sizeof(double): the offsets are expressed in elements.
  return _mm512_i32gather_pd(offsets, from, 8);
}
// Strided scatter of 16 floats: lane i of `from` is written to to[i * stride].
// The stride is converted to a 32-bit int before the offsets are computed.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
  const Packet16i step = _mm512_set1_epi32(convert_index<int>(stride));
  // Lane numbers 0..15 (arguments are listed from the highest lane down).
  const Packet16i lane_number = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  const Packet16i offsets = _mm512_mullo_epi32(step, lane_number);
  // Scale 4 = sizeof(float): the offsets are expressed in elements.
  _mm512_i32scatter_ps(to, offsets, from, 4);
}
// Strided scatter of 8 doubles: lane i of `from` is written to to[i * stride].
// The stride is converted to a 32-bit int before the offsets are computed.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride) {
  const Packet8i step = _mm256_set1_epi32(convert_index<int>(stride));
  // Lane numbers 0..7 (arguments are listed from the highest lane down).
  const Packet8i lane_number = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  const Packet8i offsets = _mm256_mullo_epi32(step, lane_number);
  // Scale 8 = sizeof(double): the offsets are expressed in elements.
  _mm512_i32scatter_pd(to, offsets, from, 8);
}
// Writes 16 copies of the scalar `a` to `to` using the aligned pstore.
template <>
EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
  const Packet16f replicated = pset1<Packet16f>(a);
  pstore(to, replicated);
}
// Writes 8 copies of the scalar `a` to `to` using the aligned pstore.
template <>
EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
  const Packet8d replicated = pset1<Packet8d>(a);
  pstore(to, replicated);
}
// Writes 16 copies of the scalar `a` to `to` using the pstore overload for int.
template <>
EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
  const Packet16i replicated = pset1<Packet16i>(a);
  pstore(to, replicated);
}
// Issues a prefetch hint (_MM_HINT_T0) for the memory at `addr`.
template <>
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
  const SsePrefetchPtrType target = (SsePrefetchPtrType)(addr);
  _mm_prefetch(target, _MM_HINT_T0);
}
// Issues a prefetch hint (_MM_HINT_T0) for the memory at `addr`.
template <>
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
  const SsePrefetchPtrType target = (SsePrefetchPtrType)(addr);
  _mm_prefetch(target, _MM_HINT_T0);
}
// Issues a prefetch hint (_MM_HINT_T0) for the memory at `addr`.
template <>
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
  const SsePrefetchPtrType target = (SsePrefetchPtrType)(addr);
  _mm_prefetch(target, _MM_HINT_T0);
}
// Returns lane 0 of a 16-float packet.
template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
  const __m128 lowest_quarter = _mm512_extractf32x4_ps(a, 0);
  return _mm_cvtss_f32(lowest_quarter);
}
// Returns lane 0 of an 8-double packet.
template <>
EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
  // Narrow 512 -> 256 -> 128 bits, then read the lowest double.
  const __m256d lower_half = _mm512_extractf64x4_pd(a, 0);
  const __m128d lowest_quarter = _mm256_extractf128_pd(lower_half, 0);
  return _mm_cvtsd_f64(lowest_quarter);
}
// Returns lane 0 of a 16-int packet.
template <>
EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
  const __m128i lowest_quarter = _mm512_extracti32x4_epi32(a, 0);
  return _mm_extract_epi32(lowest_quarter, 0);
}
// Reverses the lane order of a 16-float packet (lane i receives lane 15 - i).
template <>
EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
  // Arguments are listed from the highest lane down, so lane 0 reads index 15.
  const __m512i reversed_index = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm512_permutexvar_ps(reversed_index, a);
}
// Reverses the lane order of an 8-double packet (lane i receives lane 7 - i).
template <>
EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) {
  // The 64-bit indices 0..7 are written as pairs of 32-bit values
  // (high word 0, low word = index), listed from the highest lane down.
  const __m512i reversed_index = _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7);
  return _mm512_permutexvar_pd(reversed_index, a);
}
// Lane-wise absolute value of a 16-float packet.
template <>
EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
  // _mm512_abs_ps intrinsic not found, so the sign bit of every lane is
  // cleared with an integer mask instead.
  const __m512i all_but_sign = _mm512_set1_epi32(0x7fffffff);
  const __m512i magnitude_bits = _mm512_and_si512(_mm512_castps_si512(a), all_but_sign);
  return _mm512_castsi512_ps(magnitude_bits);
}
// Lane-wise absolute value of an 8-double packet.
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
  // No _mm512_abs_pd intrinsic is used here; the sign bit of every 64-bit lane
  // is cleared with an integer mask instead.
  const __m512i all_but_sign = _mm512_set1_epi64(0x7fffffffffffffff);
  const __m512i magnitude_bits = _mm512_and_si512(_mm512_castpd_si512(a), all_but_sign);
  return _mm512_castsi512_pd(magnitude_bits);
}
// EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) declares two __m256 variables named
// OUTPUT_0 and OUTPUT_1 that receive the lower and the upper eight floats of
// the 512-bit vector INPUT.
#ifdef EIGEN_VECTORIZE_AVX512DQ
// With AVX512DQ both halves are extracted directly with _mm512_extractf32x8_ps.
// This expansion does not end with ';' - the caller supplies it.
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
__m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
#else
// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512,
// so each half is assembled from two 128-bit extractions.
// Unlike the AVX512DQ variant, this expansion already ends with ';'.
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm256_insertf128_ps( \
_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
_mm512_extractf32x4_ps(INPUT, 1), 1); \
__m256 OUTPUT##_1 = _mm256_insertf128_ps( \
_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
_mm512_extractf32x4_ps(INPUT, 3), 1);
#endif
// EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) assigns to the existing
// 512-bit variable OUTPUT the concatenation of two __m256 values: INPUTA
// becomes the lower eight floats and INPUTB the upper eight floats.
// Both expansions end with ';' and consist of plain statements.
#ifdef EIGEN_VECTORIZE_AVX512DQ
// With AVX512DQ the upper half is inserted with a single _mm512_insertf32x8.
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
#else
// Without AVX512DQ the result is built from four 128-bit insertions; OUTPUT
// starts from _mm512_undefined_ps() because every quarter is overwritten.
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_undefined_ps(); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#endif
// Horizontal sum: returns the sum of all 16 floats of the packet.
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  // Fold the two 256-bit halves together and let the 8-float reduction finish.
  const __m256 lower_half = _mm512_extractf32x8_ps(a, 0);
  const __m256 upper_half = _mm512_extractf32x8_ps(a, 1);
  const Packet8f folded = _mm256_add_ps(lower_half, upper_half);
  return predux<Packet8f>(folded);
#else
  // Fold the four 128-bit quarters into a single 4-float vector ...
  const __m128 quarter0 = _mm512_extractf32x4_ps(a, 0);
  const __m128 quarter1 = _mm512_extractf32x4_ps(a, 1);
  const __m128 quarter2 = _mm512_extractf32x4_ps(a, 2);
  const __m128 quarter3 = _mm512_extractf32x4_ps(a, 3);
  __m128 acc = _mm_add_ps(_mm_add_ps(quarter0, quarter1), _mm_add_ps(quarter2, quarter3));
  // ... then two rounds of horizontal adds leave the total in lane 0.
  acc = _mm_hadd_ps(acc, acc);
  acc = _mm_hadd_ps(acc, _mm_permute_ps(acc, 1));
  return _mm_cvtss_f32(acc);
#endif
}
// Horizontal sum: returns the sum of all 8 doubles of the packet.
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
  // Fold the two 256-bit halves into one 4-double vector.
  const __m256d lower_half = _mm512_extractf64x4_pd(a, 0);
  const __m256d upper_half = _mm512_extractf64x4_pd(a, 1);
  const __m256d folded = _mm256_add_pd(lower_half, upper_half);
  // Horizontal add against the half-swapped copy, then once more with itself:
  // lane 0 ends up holding the sum of all four entries.
  const __m256d swapped = _mm256_permute2f128_pd(folded, folded, 1);
  const __m256d pair_sums = _mm256_hadd_pd(folded, swapped);
  const __m256d total = _mm256_hadd_pd(pair_sums, pair_sums);
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(total));
}
// Partial reduction of a 16-float packet: adds the upper 256-bit half to the
// lower one and returns the resulting 8 floats.
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  const Packet8f lower_half = _mm512_extractf32x8_ps(a, 0);
  const Packet8f upper_half = _mm512_extractf32x8_ps(a, 1);
  return padd(lower_half, upper_half);
#else
  // Same result built from 128-bit quarters: quarter q is added to quarter
  // q + 2, and the two sums form the low and high parts of the output.
  const Packet4f quarter0 = _mm512_extractf32x4_ps(a, 0);
  const Packet4f quarter1 = _mm512_extractf32x4_ps(a, 1);
  const Packet4f quarter2 = _mm512_extractf32x4_ps(a, 2);
  const Packet4f quarter3 = _mm512_extractf32x4_ps(a, 3);
  const Packet4f low_sum = padd(quarter0, quarter2);
  const Packet4f high_sum = padd(quarter1, quarter3);
  return _mm256_insertf128_ps(_mm256_castps128_ps256(low_sum), high_sum, 1);
#endif
}
// Partial reduction of an 8-double packet: adds the upper 256-bit half to the
// lower one and returns the resulting 4 doubles.
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
  const Packet4d lower_half = _mm512_extractf64x4_pd(a, 0);
  const Packet4d upper_half = _mm512_extractf64x4_pd(a, 1);
  return padd(lower_half, upper_half);
}
// Horizontal product: returns the product of all 16 floats of the packet.
// NOTE(review): the 256-bit variant below is disabled with "#if 0"; it applies
// the 128-bit _mm_permute_ps to a Packet8f value, so it looks unfinished. It
// is kept verbatim for reference.
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
  // Multiply the four 128-bit quarters together lane by lane ...
  const __m128 quarter0 = _mm512_extractf32x4_ps(a, 0);
  const __m128 quarter1 = _mm512_extractf32x4_ps(a, 1);
  const __m128 quarter2 = _mm512_extractf32x4_ps(a, 2);
  const __m128 quarter3 = _mm512_extractf32x4_ps(a, 3);
  __m128 acc = pmul(pmul(quarter0, quarter1), pmul(quarter2, quarter3));
  // ... then fold lanes 2,3 onto lanes 0,1 and finally lane 1 onto lane 0.
  acc = pmul(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 3, 2)));
  acc = pmul(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 0, 1)));
  return pfirst(acc);
#endif
}
// Horizontal product: returns the product of all 8 doubles of the packet.
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
  // Multiply the two 256-bit halves lane by lane ...
  const __m256d lower_half = _mm512_extractf64x4_pd(a, 0);
  const __m256d upper_half = _mm512_extractf64x4_pd(a, 1);
  __m256d acc = pmul(lower_half, upper_half);
  // ... fold the upper 128 bits onto the lower 128 bits ...
  acc = pmul(acc, _mm256_permute2f128_pd(acc, acc, 1));
  // ... and finally lane 1 onto lane 0.
  acc = pmul(acc, _mm256_shuffle_pd(acc, acc, 1));
  return pfirst(acc);
}
// Horizontal minimum: returns the smallest of the 16 floats of the packet.
template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
  // Reduce the four 128-bit quarters lane by lane ...
  const __m128 quarter0 = _mm512_extractf32x4_ps(a, 0);
  const __m128 quarter1 = _mm512_extractf32x4_ps(a, 1);
  const __m128 quarter2 = _mm512_extractf32x4_ps(a, 2);
  const __m128 quarter3 = _mm512_extractf32x4_ps(a, 3);
  __m128 acc = _mm_min_ps(_mm_min_ps(quarter0, quarter1), _mm_min_ps(quarter2, quarter3));
  // ... then fold lanes 2,3 onto lanes 0,1 and finally lane 1 onto lane 0.
  acc = _mm_min_ps(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 3, 2)));
  acc = _mm_min_ps(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 0, 1)));
  return pfirst(acc);
}
// Horizontal minimum: returns the smallest of the 8 doubles of the packet.
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
  // Reduce the two 256-bit halves lane by lane ...
  const __m256d lower_half = _mm512_extractf64x4_pd(a, 0);
  const __m256d upper_half = _mm512_extractf64x4_pd(a, 1);
  __m256d acc = _mm256_min_pd(lower_half, upper_half);
  // ... fold the upper 128 bits onto the lower 128 bits ...
  acc = _mm256_min_pd(acc, _mm256_permute2f128_pd(acc, acc, 1));
  // ... and finally lane 1 onto lane 0.
  acc = _mm256_min_pd(acc, _mm256_shuffle_pd(acc, acc, 1));
  return pfirst(acc);
}
// Horizontal maximum: returns the largest of the 16 floats of the packet.
template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
  // Reduce the four 128-bit quarters lane by lane ...
  const __m128 quarter0 = _mm512_extractf32x4_ps(a, 0);
  const __m128 quarter1 = _mm512_extractf32x4_ps(a, 1);
  const __m128 quarter2 = _mm512_extractf32x4_ps(a, 2);
  const __m128 quarter3 = _mm512_extractf32x4_ps(a, 3);
  __m128 acc = _mm_max_ps(_mm_max_ps(quarter0, quarter1), _mm_max_ps(quarter2, quarter3));
  // ... then fold lanes 2,3 onto lanes 0,1 and finally lane 1 onto lane 0.
  acc = _mm_max_ps(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 3, 2)));
  acc = _mm_max_ps(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 0, 1)));
  return pfirst(acc);
}
// Horizontal maximum: returns the largest of the 8 doubles of the packet.
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
  // Reduce the two 256-bit halves lane by lane ...
  const __m256d lower_half = _mm512_extractf64x4_pd(a, 0);
  const __m256d upper_half = _mm512_extractf64x4_pd(a, 1);
  __m256d acc = _mm256_max_pd(lower_half, upper_half);
  // ... fold the upper 128 bits onto the lower 128 bits ...
  acc = _mm256_max_pd(acc, _mm256_permute2f128_pd(acc, acc, 1));
  // ... and finally lane 1 onto lane 0.
  acc = _mm256_max_pd(acc, _mm256_shuffle_pd(acc, acc, 1));
  return pfirst(acc);
}
// Helper for preduxp<Packet16f>: reduces eight 8-float vectors at once.
// Element i of the returned vector is the sum of the eight elements of v<i>.
// Arguments are taken by const reference rather than by value.
EIGEN_STRONG_INLINE __m256 preduxp_sum8x8f(const __m256& v0, const __m256& v1, const __m256& v2,
                                           const __m256& v3, const __m256& v4, const __m256& v5,
                                           const __m256& v6, const __m256& v7) {
  // Round 1: sums of neighbouring elements, two inputs interleaved per vector.
  const __m256 hsum1 = _mm256_hadd_ps(v0, v1);
  const __m256 hsum2 = _mm256_hadd_ps(v2, v3);
  const __m256 hsum3 = _mm256_hadd_ps(v4, v5);
  const __m256 hsum4 = _mm256_hadd_ps(v6, v7);

  // Round 2: every 128-bit half now holds, per input, the sum of the four
  // elements that live in that half.
  const __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  const __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  const __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  const __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  // Swap the 128-bit halves and add, so each entry covers all eight elements.
  const __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  const __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  const __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  const __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  const __m256 sum1 = _mm256_add_ps(perm1, hsum5);
  const __m256 sum2 = _mm256_add_ps(perm2, hsum6);
  const __m256 sum3 = _mm256_add_ps(perm3, hsum7);
  const __m256 sum4 = _mm256_add_ps(perm4, hsum8);

  // Pick the totals of v0..v7 into output positions 0..7.
  const __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  const __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
  return _mm256_blend_ps(blend1, blend2, 0xf0);
}

// Reduces 16 packets at once: element i of the result is the sum of the 16
// floats of vecs[i]. `vecs` must point to 16 packets.
//
// Every input is split into its lower (_0) and upper (_1) 256-bit half. The
// halves are reduced eight inputs at a time by preduxp_sum8x8f, and the
// lower-half and upper-half totals are then added together, in the same order
// as the previous fully unrolled implementation.
template <>
EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs) {
  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);

  // Results 0..7 come from vecs[0..7].
  __m256 sums_0_7 = preduxp_sum8x8f(vecs0_0, vecs1_0, vecs2_0, vecs3_0,
                                    vecs4_0, vecs5_0, vecs6_0, vecs7_0);
  sums_0_7 = padd(sums_0_7, preduxp_sum8x8f(vecs0_1, vecs1_1, vecs2_1, vecs3_1,
                                            vecs4_1, vecs5_1, vecs6_1, vecs7_1));

  // Results 8..15 come from vecs[8..15].
  __m256 sums_8_15 = preduxp_sum8x8f(vecs8_0, vecs9_0, vecs10_0, vecs11_0,
                                     vecs12_0, vecs13_0, vecs14_0, vecs15_0);
  sums_8_15 = padd(sums_8_15, preduxp_sum8x8f(vecs8_1, vecs9_1, vecs10_1, vecs11_1,
                                              vecs12_1, vecs13_1, vecs14_1, vecs15_1));

  __m512 final_output;
  EIGEN_INSERT_8f_INTO_16f(final_output, sums_0_7, sums_8_15);
  return final_output;
}
// Reduces 8 packets at once: element i of the result is the sum of the 8
// doubles of vecs[i]. `vecs` must point to 8 packets.
//
// Every input is split into its lower (_0) and upper (_1) 256-bit half; the
// halves are reduced four inputs at a time and the lower-half and upper-half
// totals are added together.
template <>
EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs) {
  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);

  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);

  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);

  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);

  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);

  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);

  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);

  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);

  Packet4d tmp0, tmp1;

  // Results 0..3, lower halves: hadd interleaves the pair sums of two inputs,
  // adding the half-swapped copy completes each 4-element sum, and the blend
  // keeps inputs 0,1 in lanes 0,1 and inputs 2,3 in lanes 2,3.
  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);

  // Results 0..3, upper halves, accumulated onto the lower-half totals.
  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));

  // Results 4..7, lower halves.
  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);

  // Results 4..7, upper halves, accumulated onto the lower-half totals.
  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));

  // Start from an explicitly "undefined" register instead of reading
  // final_output inside its own initializer (an uninitialized read); both
  // 256-bit halves are overwritten by the two insertions.
  __m512d final_output = _mm512_undefined_pd();
  final_output = _mm512_insertf64x4(final_output, final_0, 0);
  return _mm512_insertf64x4(final_output, final_1, 1);
}
// PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) stores into the 512-bit entry
// OUTPUT[INDEX] the concatenation of the two 256-bit entries INPUT[INDEX]
// (lower half) and INPUT[INDEX + STRIDE] (upper half).
// The expansion already ends with ';'.
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
EIGEN_DEVICE_FUNC
inline
void
ptranspose
(
PacketBlock
<
Packet16f
,
16
>&
kernel
)
{
__m512
T0
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
0
],
kernel
.
packet
[
1
]);
__m512
T1
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
0
],
kernel
.
packet
[
1
]);
__m512
T2
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
2
],
kernel
.
packet
[
3
]);
__m512
T3
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
2
],
kernel
.
packet
[
3
]);
__m512
T4
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
4
],
kernel
.
packet
[
5
]);
__m512
T5
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
4
],
kernel
.
packet
[
5
]);
__m512
T6
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
6
],
kernel
.
packet
[
7
]);
__m512
T7
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
6
],
kernel
.
packet
[
7
]);
__m512
T8
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
8
],
kernel
.
packet
[
9
]);
__m512
T9
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
8
],
kernel
.
packet
[
9
]);
__m512
T10
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
10
],
kernel
.
packet
[
11
]);
__m512
T11
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
10
],
kernel
.
packet
[
11
]);
__m512
T12
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
12
],
kernel
.
packet
[
13
]);
__m512
T13
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
12
],
kernel
.
packet
[
13
]);
__m512
T14
=
_mm512_unpacklo_ps
(
kernel
.
packet
[
14
],
kernel
.
packet
[
15
]);
__m512
T15
=
_mm512_unpackhi_ps
(
kernel
.
packet
[
14
],
kernel
.
packet
[
15
]);
__m512
S0
=
_mm512_shuffle_ps
(
T0
,
T2
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S1
=
_mm512_shuffle_ps
(
T0
,
T2
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
__m512
S2
=
_mm512_shuffle_ps
(
T1
,
T3
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S3
=
_mm512_shuffle_ps
(
T1
,
T3
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
__m512
S4
=
_mm512_shuffle_ps
(
T4
,
T6
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S5
=
_mm512_shuffle_ps
(
T4
,
T6
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
__m512
S6
=
_mm512_shuffle_ps
(
T5
,
T7
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S7
=
_mm512_shuffle_ps
(
T5
,
T7
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
__m512
S8
=
_mm512_shuffle_ps
(
T8
,
T10
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S9
=
_mm512_shuffle_ps
(
T8
,
T10
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
__m512
S10
=
_mm512_shuffle_ps
(
T9
,
T11
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S11
=
_mm512_shuffle_ps
(
T9
,
T11
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
__m512
S12
=
_mm512_shuffle_ps
(
T12
,
T14
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S13
=
_mm512_shuffle_ps
(
T12
,
T14
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
__m512
S14
=
_mm512_shuffle_ps
(
T13
,
T15
,
_MM_SHUFFLE
(
1
,
0
,
1
,
0
));
__m512
S15
=
_mm512_shuffle_ps
(
T13
,
T15
,
_MM_SHUFFLE
(
3
,
2
,
3
,
2
));
EIGEN_EXTRACT_8f_FROM_16f
(
S0
,
S0
);
EIGEN_EXTRACT_8f_FROM_16f
(
S1
,
S1
);
EIGEN_EXTRACT_8f_FROM_16f
(
S2
,
S2
);
EIGEN_EXTRACT_8f_FROM_16f
(
S3
,
S3
);
EIGEN_EXTRACT_8f_FROM_16f
(
S4
,
S4
);
EIGEN_EXTRACT_8f_FROM_16f
(
S5
,
S5
);
EIGEN_EXTRACT_8f_FROM_16f
(
S6
,
S6
);
EIGEN_EXTRACT_8f_FROM_16f
(
S7
,
S7
);
EIGEN_EXTRACT_8f_FROM_16f
(
S8
,
S8
);
EIGEN_EXTRACT_8f_FROM_16f
(
S9
,
S9
);
EIGEN_EXTRACT_8f_FROM_16f
(
S10
,
S10
);
EIGEN_EXTRACT_8f_FROM_16f
(
S11
,
S11
);
EIGEN_EXTRACT_8f_FROM_16f
(
S12
,
S12
);
EIGEN_EXTRACT_8f_FROM_16f
(
S13
,
S13
);
EIGEN_EXTRACT_8f_FROM_16f
(
S14
,
S14
);
EIGEN_EXTRACT_8f_FROM_16f
(
S15
,
S15
);
PacketBlock
<
Packet8f
,
32
>
tmp
;
tmp
.
packet
[
0
]
=
_mm256_permute2f128_ps
(
S0_0
,
S4_0
,
0x20
);
tmp
.
packet
[
1
]
=
_mm256_permute2f128_ps
(
S1_0
,
S5_0
,
0x20
);
tmp
.
packet
[
2
]
=
_mm256_permute2f128_ps
(
S2_0
,
S6_0
,
0x20
);
tmp
.
packet
[
3
]
=
_mm256_permute2f128_ps
(
S3_0
,
S7_0
,
0x20
);
tmp
.
packet
[
4
]
=
_mm256_permute2f128_ps
(
S0_0
,
S4_0
,
0x31
);
tmp
.
packet
[
5
]
=
_mm256_permute2f128_ps
(
S1_0
,
S5_0
,
0x31
);
tmp
.
packet
[
6
]
=
_mm256_permute2f128_ps
(
S2_0
,
S6_0
,
0x31
);
tmp
.
packet
[
7
]
=
_mm256_permute2f128_ps
(
S3_0
,
S7_0
,
0x31
);
tmp
.
packet
[
8
]
=
_mm256_permute2f128_ps
(
S0_1
,
S4_1
,
0x20
);
tmp
.
packet
[
9
]
=
_mm256_permute2f128_ps
(
S1_1
,
S5_1
,
0x20
);
tmp
.
packet
[
10
]
=
_mm256_permute2f128_ps
(
S2_1
,
S6_1
,
0x20
);
tmp
.
packet
[
11
]
=
_mm256_permute2f128_ps
(
S3_1
,
S7_1
,
0x20
);
tmp
.
packet
[
12
]
=
_mm256_permute2f128_ps
(
S0_1
,
S4_1
,
0x31
);
tmp
.
packet
[
13
]
=
_mm256_permute2f128_ps
(
S1_1
,
S5_1
,
0x31
);
tmp
.
packet
[
14
]
=
_mm256_permute2f128_ps
(
S2_1
,
S6_1
,
0x31
);
tmp
.
packet
[
15
]
=
_mm256_permute2f128_ps
(
S3_1
,
S7_1
,
0x31
);
// Second set of _m256 outputs
tmp
.
packet
[
16
]
=
_mm256_permute2f128_ps
(
S8_0
,
S12_0
,
0x20
);
tmp
.
packet
[
17
]
=
_mm256_permute2f128_ps
(
S9_0
,
S13_0
,
0x20
);
tmp
.
packet
[
18
]
=
_mm256_permute2f128_ps
(
S10_0
,
S14_0
,
0x20
);
tmp
.
packet
[
19
]
=
_mm256_permute2f128_ps
(
S11_0
,
S15_0
,
0x20
);
tmp
.
packet
[
20
]
=
_mm256_permute2f128_ps
(
S8_0
,
S12_0
,
0x31
);
tmp
.
packet
[
21
]
=
_mm256_permute2f128_ps
(
S9_0
,
S13_0
,
0x31
);
tmp
.
packet
[
22
]
=
_mm256_permute2f128_ps
(
S10_0
,
S14_0
,
0x31
);
tmp
.
packet
[
23
]
=
_mm256_permute2f128_ps
(
S11_0
,
S15_0
,
0x31
);
tmp
.
packet
[
24
]
=
_mm256_permute2f128_ps
(
S8_1
,
S12_1
,
0x20
);
tmp
.
packet
[
25
]
=
_mm256_permute2f128_ps
(
S9_1
,
S13_1
,
0x20
);
tmp
.
packet
[
26
]
=
_mm256_permute2f128_ps
(
S10_1
,
S14_1
,
0x20
);
tmp
.
packet
[
27
]
=
_mm256_permute2f128_ps
(
S11_1
,
S15_1
,
0x20
);
tmp
.
packet
[
28
]
=
_mm256_permute2f128_ps
(
S8_1
,
S12_1
,
0x31
);
tmp
.
packet
[
29
]
=
_mm256_permute2f128_ps
(
S9_1
,
S13_1
,
0x31
);
tmp
.
packet
[
30
]
=
_mm256_permute2f128_ps
(
S10_1
,
S14_1
,
0x31
);
tmp
.
packet
[
31
]
=
_mm256_permute2f128_ps
(
S11_1
,
S15_1
,
0x31
);
// Pack them into the output
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
0
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
1
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
2
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
3
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
4
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
5
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
6
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
7
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
8
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
9
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
10
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
11
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
12
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
13
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
14
,
16
);
PACK_OUTPUT
(
kernel
.
packet
,
tmp
.
packet
,
15
,
16
);
}
// Fills OUTPUT[INDEX] by passing INPUT[2 * INDEX] and INPUT[2 * INDEX + STRIDE]
// to EIGEN_INSERT_8f_INTO_16f (defined elsewhere in this file; by its name it
// joins two 8-float halves into one 16-float packet -- confirm at its definition).
// The expansion already ends in ';', so a call site written with a trailing
// ';' produces an extra empty statement.
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
INPUT[2 * INDEX + STRIDE]);
// In-place transpose step for a block of four 16-float packets.
//
// The four input rows are interleaved pairwise, regrouped with 4-element
// shuffles, split into 256-bit halves and finally re-packed into the four
// output packets through PACK_OUTPUT_2.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  // Stage 1: interleave the 32-bit elements of rows (0,1) and rows (2,3).
  __m512 lo01 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 hi01 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 lo23 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 hi23 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  // Stage 2: combine the (0,1) and (2,3) interleavings inside every 128-bit lane.
  __m512 S0 = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S1 = _mm512_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S2 = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S3 = _mm512_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));

  // Stage 3: split every 512-bit register into two 256-bit halves. The macro
  // is what introduces the S<k>_0 / S<k>_1 names used below.
  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);

  // Stage 4: regroup the 128-bit lanes (0x20 = both low lanes, 0x31 = both
  // high lanes of the two operands).
  PacketBlock<Packet8f, 8> halves;

  halves.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
  halves.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
  halves.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
  halves.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);

  halves.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
  halves.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
  halves.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
  halves.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);

  // Stage 5: output packet i is assembled from halves 2*i and 2*i + 1.
  PACK_OUTPUT_2(kernel.packet, halves.packet, 0, 1);
  PACK_OUTPUT_2(kernel.packet, halves.packet, 1, 1);
  PACK_OUTPUT_2(kernel.packet, halves.packet, 2, 1);
  PACK_OUTPUT_2(kernel.packet, halves.packet, 3, 1);
}
// Builds the 8-double packet OUTPUT[INDEX] from two 4-double packets:
// INPUT[INDEX] goes into 256-bit half 0 and INPUT[INDEX + STRIDE] into half 1.
// Expands to two statements, so do not use it as the body of an unbraced
// if/for.
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \
OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);
// Builds the 8-double packet OUTPUT[INDEX] from two 4-double packets:
// INPUT[2 * INDEX] goes into 256-bit half 0 and INPUT[2 * INDEX + STRIDE]
// into half 1. Expands to two statements, so do not use it as the body of an
// unbraced if/for.
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \
OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
OUTPUT[INDEX] = \
_mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
// In-place transpose step for a block of four 8-double packets.
//
// Rows are paired with 64-bit shuffles, each intermediate is split into its
// two 256-bit halves, the halves are regrouped by 128-bit lane and the result
// is re-packed into the four output packets through PACK_OUTPUT_D.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
  // Immediate 0 takes element 0 of every 128-bit lane from both rows,
  // immediate 0xff takes element 1.
  const __m512d even01 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  const __m512d odd01 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
  const __m512d even23 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
  const __m512d odd23 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);

  // Each 256-bit half is needed twice below, so extract it once.
  const __m256d even01_h0 = _mm512_extractf64x4_pd(even01, 0);
  const __m256d even01_h1 = _mm512_extractf64x4_pd(even01, 1);
  const __m256d odd01_h0 = _mm512_extractf64x4_pd(odd01, 0);
  const __m256d odd01_h1 = _mm512_extractf64x4_pd(odd01, 1);
  const __m256d even23_h0 = _mm512_extractf64x4_pd(even23, 0);
  const __m256d even23_h1 = _mm512_extractf64x4_pd(even23, 1);
  const __m256d odd23_h0 = _mm512_extractf64x4_pd(odd23, 0);
  const __m256d odd23_h1 = _mm512_extractf64x4_pd(odd23, 1);

  // 0x20 joins the low 128-bit lanes of both operands, 0x31 the high lanes.
  PacketBlock<Packet4d, 8> quads;

  quads.packet[0] = _mm256_permute2f128_pd(even01_h0, even23_h0, 0x20);
  quads.packet[1] = _mm256_permute2f128_pd(odd01_h0, odd23_h0, 0x20);
  quads.packet[2] = _mm256_permute2f128_pd(even01_h0, even23_h0, 0x31);
  quads.packet[3] = _mm256_permute2f128_pd(odd01_h0, odd23_h0, 0x31);

  quads.packet[4] = _mm256_permute2f128_pd(even01_h1, even23_h1, 0x20);
  quads.packet[5] = _mm256_permute2f128_pd(odd01_h1, odd23_h1, 0x20);
  quads.packet[6] = _mm256_permute2f128_pd(even01_h1, even23_h1, 0x31);
  quads.packet[7] = _mm256_permute2f128_pd(odd01_h1, odd23_h1, 0x31);

  // Output packet i is assembled from quads 2*i and 2*i + 1.
  PACK_OUTPUT_D(kernel.packet, quads.packet, 0, 1);
  PACK_OUTPUT_D(kernel.packet, quads.packet, 1, 1);
  PACK_OUTPUT_D(kernel.packet, quads.packet, 2, 1);
  PACK_OUTPUT_D(kernel.packet, quads.packet, 3, 1);
}
// In-place transpose of an 8x8 block of doubles held in eight 8-double packets.
//
// Rows are interleaved pairwise, each intermediate is split into its two
// 256-bit halves, the halves are regrouped by 128-bit lane and the result is
// re-packed into the eight output packets through PACK_OUTPUT_SQ_D.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
  // Stage 1: interleave 64-bit elements of neighbouring rows.
  const __m512d lo01 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  const __m512d hi01 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  const __m512d lo23 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
  const __m512d hi23 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
  const __m512d lo45 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
  const __m512d hi45 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
  const __m512d lo67 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
  const __m512d hi67 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);

  // Stage 2: each 256-bit half is needed twice below, so extract it once.
  const __m256d lo01_h0 = _mm512_extractf64x4_pd(lo01, 0);
  const __m256d lo01_h1 = _mm512_extractf64x4_pd(lo01, 1);
  const __m256d hi01_h0 = _mm512_extractf64x4_pd(hi01, 0);
  const __m256d hi01_h1 = _mm512_extractf64x4_pd(hi01, 1);
  const __m256d lo23_h0 = _mm512_extractf64x4_pd(lo23, 0);
  const __m256d lo23_h1 = _mm512_extractf64x4_pd(lo23, 1);
  const __m256d hi23_h0 = _mm512_extractf64x4_pd(hi23, 0);
  const __m256d hi23_h1 = _mm512_extractf64x4_pd(hi23, 1);
  const __m256d lo45_h0 = _mm512_extractf64x4_pd(lo45, 0);
  const __m256d lo45_h1 = _mm512_extractf64x4_pd(lo45, 1);
  const __m256d hi45_h0 = _mm512_extractf64x4_pd(hi45, 0);
  const __m256d hi45_h1 = _mm512_extractf64x4_pd(hi45, 1);
  const __m256d lo67_h0 = _mm512_extractf64x4_pd(lo67, 0);
  const __m256d lo67_h1 = _mm512_extractf64x4_pd(lo67, 1);
  const __m256d hi67_h0 = _mm512_extractf64x4_pd(hi67, 0);
  const __m256d hi67_h1 = _mm512_extractf64x4_pd(hi67, 1);

  // Stage 3: 0x20 joins the low 128-bit lanes of both operands, 0x31 the
  // high lanes.
  PacketBlock<Packet4d, 16> quads;

  // Rows 0..3 of the input.
  quads.packet[0] = _mm256_permute2f128_pd(lo01_h0, lo23_h0, 0x20);
  quads.packet[1] = _mm256_permute2f128_pd(hi01_h0, hi23_h0, 0x20);
  quads.packet[2] = _mm256_permute2f128_pd(lo01_h0, lo23_h0, 0x31);
  quads.packet[3] = _mm256_permute2f128_pd(hi01_h0, hi23_h0, 0x31);

  quads.packet[4] = _mm256_permute2f128_pd(lo01_h1, lo23_h1, 0x20);
  quads.packet[5] = _mm256_permute2f128_pd(hi01_h1, hi23_h1, 0x20);
  quads.packet[6] = _mm256_permute2f128_pd(lo01_h1, lo23_h1, 0x31);
  quads.packet[7] = _mm256_permute2f128_pd(hi01_h1, hi23_h1, 0x31);

  // Rows 4..7 of the input.
  quads.packet[8] = _mm256_permute2f128_pd(lo45_h0, lo67_h0, 0x20);
  quads.packet[9] = _mm256_permute2f128_pd(hi45_h0, hi67_h0, 0x20);
  quads.packet[10] = _mm256_permute2f128_pd(lo45_h0, lo67_h0, 0x31);
  quads.packet[11] = _mm256_permute2f128_pd(hi45_h0, hi67_h0, 0x31);

  quads.packet[12] = _mm256_permute2f128_pd(lo45_h1, lo67_h1, 0x20);
  quads.packet[13] = _mm256_permute2f128_pd(hi45_h1, hi67_h1, 0x20);
  quads.packet[14] = _mm256_permute2f128_pd(lo45_h1, lo67_h1, 0x31);
  quads.packet[15] = _mm256_permute2f128_pd(hi45_h1, hi67_h1, 0x31);

  // Stage 4: output packet i is assembled from quads i and i + 8.
  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 0, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 1, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 2, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 3, 8);

  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 4, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 5, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 6, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, quads.packet, 7, 8);
}
// Lane-wise select for 16-float packets: lane i of the result is taken from
// thenPacket when ifPacket.select[i] is set and from elsePacket otherwise.
//
// This used to be a stub that asserted in debug builds and silently returned
// an unspecified packet when assertions were compiled out. It now follows the
// same mask-and-blend scheme as the Packet8d overload in this file.
template <>
EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket,
                                     const Packet16f& thenPacket,
                                     const Packet16f& elsePacket) {
  // Collapse the 16 per-lane flags into a bit mask: bit i <-> lane i.
  unsigned int bits = 0;
  for (int lane = 0; lane < 16; ++lane) {
    if (ifPacket.select[lane]) bits |= (1u << lane);
  }
  const __mmask16 m = static_cast<__mmask16>(bits);
  // _mm512_mask_blend_ps picks its last operand for lanes whose mask bit is
  // set, hence elsePacket comes first.
  return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
}
// Lane-wise select for 8-double packets: lane i of the result is taken from
// thenPacket when bit i of the mask built from ifPacket.select is set and
// from elsePacket otherwise.
template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
                                    const Packet8d& thenPacket,
                                    const Packet8d& elsePacket) {
  // OR every flag into its own bit position: select[i] lands in bit i.
  int bits = 0;
  for (int lane = 0; lane < 8; ++lane) {
    bits |= (ifPacket.select[lane] << lane);
  }
  const __mmask8 m = bits;
  // The blend picks its last operand for lanes whose mask bit is set.
  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
}
// Converts 16 floats to 16 32-bit integers with _mm512_cvttps_epi32, i.e. the
// truncating (round-toward-zero) conversion.
template <>
EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
  const Packet16i truncated = _mm512_cvttps_epi32(a);
  return truncated;
}
// Converts 16 32-bit integers to 16 floats with _mm512_cvtepi32_ps.
template <>
EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
  const Packet16f as_float = _mm512_cvtepi32_ps(a);
  return as_float;
}
// Shifts a window of 16 floats across two packets.
//
// For Offset != 0, `first` ends up holding elements Offset..15 of the original
// `first` in its low lanes, followed by elements 0..Offset-1 of `second` in
// its top Offset lanes. Offset == 0 leaves `first` untouched.
template <int Offset>
struct palign_impl<Offset, Packet16f> {
  static EIGEN_STRONG_INLINE void run(Packet16f& first,
                                      const Packet16f& second) {
    if (Offset == 0) return;

    // Lane i reads element Offset + i; the permute only looks at the low
    // 4 bits of each index, so the values wrap around modulo 16.
    const __m512i idx_from_first = _mm512_set_epi32(
        Offset + 15, Offset + 14, Offset + 13, Offset + 12,
        Offset + 11, Offset + 10, Offset + 9, Offset + 8,
        Offset + 7, Offset + 6, Offset + 5, Offset + 4,
        Offset + 3, Offset + 2, Offset + 1, Offset);

    // Lane i reads element Offset - 16 + i; only the top Offset lanes (where
    // that index is non-negative) survive the blend below.
    const __m512i idx_from_second = _mm512_set_epi32(
        Offset - 1, Offset - 2, Offset - 3, Offset - 4,
        Offset - 5, Offset - 6, Offset - 7, Offset - 8,
        Offset - 9, Offset - 10, Offset - 11, Offset - 12,
        Offset - 13, Offset - 14, Offset - 15, Offset - 16);

    // Top Offset bits set: those lanes are taken from `second`.
    const unsigned short take_second =
        static_cast<unsigned short>(0xFFFFu << (16 - Offset));

    const Packet16f rotated_first = _mm512_permutexvar_ps(idx_from_first, first);
    const Packet16f rotated_second =
        _mm512_permutexvar_ps(idx_from_second, second);
    first = _mm512_mask_blend_ps(take_second, rotated_first, rotated_second);
  }
};
// Shifts a window of 8 doubles across two packets.
//
// For Offset != 0, `first` ends up holding elements Offset..7 of the original
// `first` in its low lanes, followed by elements 0..Offset-1 of `second` in
// its top Offset lanes. Offset == 0 leaves `first` untouched.
template <int Offset>
struct palign_impl<Offset, Packet8d> {
  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
    if (Offset == 0) return;

    // 64-bit indices written as (high dword = 0, low dword = index) pairs.
    // Lane i reads element Offset + i; only the low 3 bits of each index are
    // used by the permute, so the values wrap around modulo 8.
    const __m512i idx_from_first = _mm512_set_epi32(
        0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4,
        0, Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);

    // Lane i reads element Offset - 8 + i; only the top Offset lanes survive
    // the blend below.
    const __m512i idx_from_second = _mm512_set_epi32(
        0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4,
        0, Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);

    // Top Offset bits set: those lanes are taken from `second`.
    const unsigned char take_second =
        static_cast<unsigned char>(0xFFu << (8 - Offset));

    const Packet8d rotated_first = _mm512_permutexvar_pd(idx_from_first, first);
    const Packet8d rotated_second =
        _mm512_permutexvar_pd(idx_from_second, second);
    first = _mm512_mask_blend_pd(take_second, rotated_first, rotated_second);
  }
};
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_PACKET_MATH_AVX512_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/AltiVec/Complex.h
View file @
13b115ab
...
...
@@ -2,30 +2,34 @@
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_ALTIVEC_H
#define EIGEN_COMPLEX_ALTIVEC_H
#ifndef EIGEN_COMPLEX
32
_ALTIVEC_H
#define EIGEN_COMPLEX
32
_ALTIVEC_H
namespace
Eigen
{
namespace
internal
{
static
Packet4ui
p4ui_CONJ_XOR
=
vec_mergeh
((
Packet4ui
)
p4i_ZERO
,
(
Packet4ui
)
p4f_ZERO_
);
//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
static
Packet16uc
p16uc_COMPLEX_RE
=
vec_sld
((
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
0
),
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
2
),
8
);
//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static
Packet16uc
p16uc_COMPLEX_IM
=
vec_sld
((
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
1
),
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
3
),
8
);
//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static
Packet16uc
p16uc_COMPLEX_REV
=
vec_sld
(
p16uc_REVERSE
,
p16uc_REVERSE
,
8
);
//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
static
Packet16uc
p16uc_COMPLEX_REV2
=
vec_sld
(
p16uc_FORWARD
,
p16uc_FORWARD
,
8
);
//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static
Packet16uc
p16uc_PSET_HI
=
(
Packet16uc
)
vec_mergeh
((
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
0
),
(
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
1
));
//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static
Packet16uc
p16uc_PSET_LO
=
(
Packet16uc
)
vec_mergeh
((
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
2
),
(
Packet4ui
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
3
));
//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static
Packet4ui
p4ui_CONJ_XOR
=
vec_mergeh
((
Packet4ui
)
p4i_ZERO
,
(
Packet4ui
)
p4f_MZERO
);
//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
#ifdef __VSX__
#if defined(_BIG_ENDIAN)
static
Packet2ul
p2ul_CONJ_XOR1
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2d_MZERO
,
(
Packet4ui
)
p2l_ZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
static
Packet2ul
p2ul_CONJ_XOR2
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2l_ZERO
,
(
Packet4ui
)
p2d_MZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
#else
static
Packet2ul
p2ul_CONJ_XOR1
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2l_ZERO
,
(
Packet4ui
)
p2d_MZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
static
Packet2ul
p2ul_CONJ_XOR2
=
(
Packet2ul
)
vec_sld
((
Packet4ui
)
p2d_MZERO
,
(
Packet4ui
)
p2l_ZERO
,
8
);
//{ 0x8000000000000000, 0x0000000000000000 };
#endif
#endif
//---------- float ----------
struct
Packet2cf
{
EIGEN_STRONG_INLINE
Packet2cf
()
{}
EIGEN_STRONG_INLINE
explicit
Packet2cf
()
:
v
(
p4f_ZERO
)
{}
EIGEN_STRONG_INLINE
explicit
Packet2cf
(
const
Packet4f
&
a
)
:
v
(
a
)
{}
Packet4f
v
;
};
...
...
@@ -33,10 +37,12 @@ struct Packet2cf
template
<
>
struct
packet_traits
<
std
::
complex
<
float
>
>
:
default_packet_traits
{
typedef
Packet2cf
type
;
typedef
Packet2cf
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
2
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
...
...
@@ -47,65 +53,78 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2
=
0
,
HasMin
=
0
,
HasMax
=
0
,
#ifdef __VSX__
HasBlend
=
1
,
#endif
HasSetLinear
=
0
};
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
}
;
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
,
alignment
=
Aligned16
};
typedef
Packet2cf
half
;
};
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pset1
<
Packet2cf
>
(
const
std
::
complex
<
float
>&
from
)
{
Packet2cf
res
;
/* On AltiVec we cannot load 64-bit registers, so we have to take care of alignment */
if
((
ptrdiff_t
(
&
from
)
%
16
)
==
0
)
if
((
std
::
ptrdiff_t
(
&
from
)
%
16
)
==
0
)
res
.
v
=
pload
<
Packet4f
>
((
const
float
*
)
&
from
);
else
res
.
v
=
ploadu
<
Packet4f
>
((
const
float
*
)
&
from
);
res
.
v
=
vec_perm
(
res
.
v
,
res
.
v
,
p16uc_PSET_HI
);
res
.
v
=
vec_perm
(
res
.
v
,
res
.
v
,
p16uc_PSET
64
_HI
);
return
res
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
padd
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vec_add
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
psub
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vec_sub
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pload
<
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
)
{
return
Packet2cf
(
pload
<
Packet4f
>
((
const
float
*
)
from
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
ploadu
<
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
)
{
return
Packet2cf
(
ploadu
<
Packet4f
>
((
const
float
*
)
from
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
ploaddup
<
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
)
{
return
pset1
<
Packet2cf
>
(
*
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
pstore
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
pstoreu
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
Packet2cf
pgather
<
std
::
complex
<
float
>
,
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
,
Index
stride
)
{
std
::
complex
<
float
>
EIGEN_ALIGN16
af
[
2
];
af
[
0
]
=
from
[
0
*
stride
];
af
[
1
]
=
from
[
1
*
stride
];
return
pload
<
Packet2cf
>
(
af
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
void
pscatter
<
std
::
complex
<
float
>
,
Packet2cf
>
(
std
::
complex
<
float
>*
to
,
const
Packet2cf
&
from
,
Index
stride
)
{
std
::
complex
<
float
>
EIGEN_ALIGN16
af
[
2
];
pstore
<
std
::
complex
<
float
>
>
((
std
::
complex
<
float
>
*
)
af
,
from
);
to
[
0
*
stride
]
=
af
[
0
];
to
[
1
*
stride
]
=
af
[
1
];
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
padd
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
a
.
v
+
b
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
psub
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
a
.
v
-
b
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pnegate
(
const
Packet2cf
&
a
)
{
return
Packet2cf
(
pnegate
(
a
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pconj
(
const
Packet2cf
&
a
)
{
return
Packet2cf
(
(
Packet4f
)
vec_xor
((
Packet4ui
)
a
.
v
,
p4ui_CONJ_XOR
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pconj
(
const
Packet2cf
&
a
)
{
return
Packet2cf
(
pxor
<
Packet4f
>
(
a
.
v
,
reinterpret_cast
<
Packet4f
>
(
p4ui_CONJ_XOR
))
)
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pmul
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
Packet4f
v1
,
v2
;
// Permute and multiply the real parts of a and b
v1
=
vec_perm
(
a
.
v
,
a
.
v
,
p16uc_
COMPLEX_RE
);
v1
=
vec_perm
(
a
.
v
,
a
.
v
,
p16uc_
PSET32_WODD
);
// Get the imaginary parts of a
v2
=
vec_perm
(
a
.
v
,
a
.
v
,
p16uc_
COMPLEX_IM
);
v2
=
vec_perm
(
a
.
v
,
a
.
v
,
p16uc_
PSET32_WEVEN
);
// multiply a_re * b
v1
=
vec_madd
(
v1
,
b
.
v
,
p4f_ZERO
);
// multiply a_im * b and get the conjugate result
v2
=
vec_madd
(
v2
,
b
.
v
,
p4f_ZERO
);
v2
=
(
Packet4f
)
vec_xor
((
Packet4ui
)
v2
,
p4ui_CONJ_XOR
);
v2
=
reinterpret_cast
<
Packet4f
>
(
pxor
(
v2
,
reinterpret_cast
<
Packet4f
>
(
p4ui_CONJ_XOR
)
))
;
// permute back to a proper order
v2
=
vec_perm
(
v2
,
v2
,
p16uc_COMPLEX_REV
);
v2
=
vec_perm
(
v2
,
v2
,
p16uc_COMPLEX
32
_REV
);
return
Packet2cf
(
vec_add
(
v1
,
v2
));
return
Packet2cf
(
padd
<
Packet4f
>
(
v1
,
v2
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pand
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vec_and
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
por
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vec_or
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pxor
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vec_xor
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pandnot
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vec_and
(
a
.
v
,
vec_nor
(
b
.
v
,
b
.
v
)
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pand
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
pand
<
Packet4f
>
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
por
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
por
<
Packet4f
>
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pxor
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
pxor
<
Packet4f
>
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pandnot
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
pandnot
<
Packet4f
>
(
a
.
v
,
b
.
v
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pload
<
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
Packet2cf
(
pload
<
Packet4f
>
((
const
float
*
)
from
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
ploadu
<
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
Packet2cf
(
ploadu
<
Packet4f
>
((
const
float
*
)
from
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
ploaddup
<
Packet2cf
>
(
const
std
::
complex
<
float
>*
from
)
{
return
pset1
<
Packet2cf
>
(
*
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
pstore
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
pstoreu
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
std
::
complex
<
float
>
>
(
const
std
::
complex
<
float
>
*
addr
)
{
vec_dstt
((
float
*
)
addr
,
DST_CTRL
(
2
,
2
,
32
),
DST_CHAN
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
std
::
complex
<
float
>
>
(
const
std
::
complex
<
float
>
*
addr
)
{
EIGEN_PPC_PREFETCH
(
addr
);
}
template
<
>
EIGEN_STRONG_INLINE
std
::
complex
<
float
>
pfirst
<
Packet2cf
>
(
const
Packet2cf
&
a
)
{
...
...
@@ -118,26 +137,30 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
preverse
(
const
Packet2cf
&
a
)
{
Packet4f
rev_a
;
rev_a
=
vec_perm
(
a
.
v
,
a
.
v
,
p16uc_COMPLEX_REV2
);
rev_a
=
vec_perm
(
a
.
v
,
a
.
v
,
p16uc_COMPLEX
32
_REV2
);
return
Packet2cf
(
rev_a
);
}
template
<
>
EIGEN_STRONG_INLINE
std
::
complex
<
float
>
predux
<
Packet2cf
>
(
const
Packet2cf
&
a
)
{
Packet4f
b
;
b
=
(
Packet4f
)
vec_sld
(
a
.
v
,
a
.
v
,
8
);
b
=
padd
(
a
.
v
,
b
);
return
pfirst
(
Packet2cf
(
b
));
b
=
vec_sld
(
a
.
v
,
a
.
v
,
8
);
b
=
padd
<
Packet4f
>
(
a
.
v
,
b
);
return
pfirst
<
Packet2cf
>
(
Packet2cf
(
b
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
preduxp
<
Packet2cf
>
(
const
Packet2cf
*
vecs
)
{
Packet4f
b1
,
b2
;
b1
=
(
Packet4f
)
vec_sld
(
vecs
[
0
].
v
,
vecs
[
1
].
v
,
8
);
b2
=
(
Packet4f
)
vec_sld
(
vecs
[
1
].
v
,
vecs
[
0
].
v
,
8
);
b2
=
(
Packet4f
)
vec_sld
(
b2
,
b2
,
8
);
b2
=
padd
(
b1
,
b2
);
#ifdef _BIG_ENDIAN
b1
=
vec_sld
(
vecs
[
0
].
v
,
vecs
[
1
].
v
,
8
);
b2
=
vec_sld
(
vecs
[
1
].
v
,
vecs
[
0
].
v
,
8
);
#else
b1
=
vec_sld
(
vecs
[
1
].
v
,
vecs
[
0
].
v
,
8
);
b2
=
vec_sld
(
vecs
[
0
].
v
,
vecs
[
1
].
v
,
8
);
#endif
b2
=
vec_sld
(
b2
,
b2
,
8
);
b2
=
padd
<
Packet4f
>
(
b1
,
b2
);
return
Packet2cf
(
b2
);
}
...
...
@@ -146,10 +169,10 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
{
Packet4f
b
;
Packet2cf
prod
;
b
=
(
Packet4f
)
vec_sld
(
a
.
v
,
a
.
v
,
8
);
prod
=
pmul
(
a
,
Packet2cf
(
b
));
b
=
vec_sld
(
a
.
v
,
a
.
v
,
8
);
prod
=
pmul
<
Packet2cf
>
(
a
,
Packet2cf
(
b
));
return
pfirst
(
prod
);
return
pfirst
<
Packet2cf
>
(
prod
);
}
template
<
int
Offset
>
...
...
@@ -159,7 +182,11 @@ struct palign_impl<Offset,Packet2cf>
{
if
(
Offset
==
1
)
{
#ifdef _BIG_ENDIAN
first
.
v
=
vec_sld
(
first
.
v
,
second
.
v
,
8
);
#else
first
.
v
=
vec_sld
(
second
.
v
,
first
.
v
,
8
);
#endif
}
}
};
...
...
@@ -197,21 +224,207 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL
(
Packet2cf
,
Packet4f
)
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pdiv
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
// TODO optimize it for AltiVec
Packet2cf
res
=
conj_helper
<
Packet2cf
,
Packet2cf
,
false
,
true
>
().
pmul
(
a
,
b
);
Packet4f
s
=
vec_madd
(
b
.
v
,
b
.
v
,
p4f_ZERO
);
return
Packet2cf
(
pdiv
(
res
.
v
,
vec_add
(
s
,
vec_perm
(
s
,
s
,
p16uc_COMPLEX_REV
))));
Packet2cf
res
=
conj_helper
<
Packet2cf
,
Packet2cf
,
false
,
true
>
().
pmul
(
a
,
b
);
Packet4f
s
=
pmul
<
Packet4f
>
(
b
.
v
,
b
.
v
);
return
Packet2cf
(
pdiv
(
res
.
v
,
padd
<
Packet4f
>
(
s
,
vec_perm
(
s
,
s
,
p16uc_COMPLEX
32
_REV
))));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pcplxflip
<
Packet2cf
>
(
const
Packet2cf
&
x
)
{
return
Packet2cf
(
vec_perm
(
x
.
v
,
x
.
v
,
p16uc_COMPLEX_REV
));
return
Packet2cf
(
vec_perm
(
x
.
v
,
x
.
v
,
p16uc_COMPLEX32_REV
));
}
// Transposes a 2x2 block of complex<float> values in place: both rows are
// permuted against each other with the TRANSPOSE64_HI / TRANSPOSE64_LO masks.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
  // Snapshot both rows first so the second permute still sees the original row 0.
  const Packet4f row0 = kernel.packet[0].v;
  const Packet4f row1 = kernel.packet[1].v;
  kernel.packet[0].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_LO);
}
#ifdef __VSX__
// Selects whole complex<float> elements: each 64-bit (re, im) pair is treated
// as one Packet2d lane and the choice is delegated to pblend<Packet2d>.
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket)
{
  const Packet2d then_lanes = reinterpret_cast<Packet2d>(thenPacket.v);
  const Packet2d else_lanes = reinterpret_cast<Packet2d>(elsePacket.v);
  const Packet2d chosen = pblend<Packet2d>(ifPacket, then_lanes, else_lanes);
  return Packet2cf(reinterpret_cast<Packet4f>(chosen));
}
#endif
//---------- double ----------
#ifdef __VSX__
// Packet holding a single std::complex<double>, stored as one Packet2d.
struct Packet1cd
{
  // Underlying two-double vector.
  Packet2d v;

  // Leaves `v` uninitialised.
  EIGEN_STRONG_INLINE Packet1cd() {}

  // Wraps an existing two-double vector.
  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
};
// Vectorisation traits for std::complex<double>: one element per packet
// (Packet1cd), no distinct half packet, arithmetic enabled, abs/min/max and
// linear fill disabled.
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    Vectorizable    = 1,
    AlignedOnScalar = 0,
    size            = 1,
    HasHalfPacket   = 0,

    HasAdd       = 1,
    HasSub       = 1,
    HasMul       = 1,
    HasDiv       = 1,
    HasNegate    = 1,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasSetLinear = 0
  };
};
// Reverse mapping for Packet1cd: scalar type, element count, alignment and
// half-packet type.
template<> struct unpacket_traits<Packet1cd>
{
  typedef std::complex<double> type;
  enum {
    size      = 1,
    alignment = Aligned16
  };
  typedef Packet1cd half;
};
// Loads one complex<double> by viewing it as two consecutive doubles and
// delegating to pload<Packet2d>.
template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
{
  const double* scalars = reinterpret_cast<const double*>(from);
  return Packet1cd(pload<Packet2d>(scalars));
}
// Loads one complex<double> by viewing it as two consecutive doubles and
// delegating to ploadu<Packet2d>.
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
{
  const double* scalars = reinterpret_cast<const double*>(from);
  return Packet1cd(ploadu<Packet2d>(scalars));
}
// Stores the packet's two doubles to `to` through the Packet2d pstore.
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from)
{
  double* scalars = reinterpret_cast<double*>(to);
  pstore(scalars, from.v);
}
// Stores the packet's two doubles to `to` through the Packet2d pstoreu.
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from)
{
  double* scalars = reinterpret_cast<double*>(to);
  pstoreu(scalars, from.v);
}
// Builds a packet from a single complex<double> value.
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{
  /* The referenced value may sit at any address, so an unaligned load is required here. */
  const std::complex<double>* src = &from;
  return ploadu<Packet1cd>(src);
}
// Gathers the single complex<double> that makes up a Packet1cd.
//
// A Packet1cd holds exactly one element (packet_traits<std::complex<double> >
// ::size == 1), so only from[0] may be touched and the stride is irrelevant.
// The previous implementation also read from[1 * stride], which lies outside
// the gathered range and can be outside the caller's buffer altogether.
// `from` carries no alignment guarantee here, hence the unaligned load.
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index /*stride*/)
{
  return ploadu<Packet1cd>(from);
}
// Scatters the single complex<double> held by a Packet1cd.
//
// A Packet1cd holds exactly one element, so only to[0] is written and the
// stride is irrelevant. The previous implementation also wrote a second,
// never-initialised temporary to to[1 * stride], clobbering memory that does
// not belong to this packet. `to` carries no alignment guarantee here, hence
// the unaligned store.
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index /*stride*/)
{
  pstoreu<std::complex<double> >(to, from);
}
// Complex addition: element-wise sum of the two underlying double vectors.
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const Packet2d sum = a.v + b.v;
  return Packet1cd(sum);
}
// Complex subtraction: element-wise difference of the two underlying double vectors.
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const Packet2d difference = a.v - b.v;
  return Packet1cd(difference);
}
// Negates the complex value by negating the underlying Packet2d.
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
{
  const Packet2d negated = pnegate(Packet2d(a.v));
  return Packet1cd(negated);
}
// Complex conjugate, implemented as an XOR with the p2ul_CONJ_XOR2 mask
// (by its name a sign-bit mask for one of the two doubles -- see its definition).
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
{
  const Packet2d sign_mask = reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2);
  return Packet1cd(pxor(a.v, sign_mask));
}
// Complex multiplication a * b for a single complex<double>.
//
// The product is formed as re(a) * b plus im(a) * b, where the second term
// has its two doubles swapped and is XOR-ed with the p2ul_CONJ_XOR1 mask
// before the final addition.
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // Real part of a, permuted with the PSET64_HI mask.
  const Packet2d re_a = vec_perm(a.v, a.v, p16uc_PSET64_HI);
  // Imaginary part of a, permuted with the PSET64_LO mask.
  const Packet2d im_a = vec_perm(a.v, a.v, p16uc_PSET64_LO);

  // re(a) * b
  const Packet2d re_term = vec_madd(re_a, b.v, p2d_ZERO);

  // im(a) * b, then swap the two 64-bit halves and apply the XOR mask.
  Packet2d im_term = vec_madd(im_a, b.v, p2d_ZERO);
  im_term = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(im_term), reinterpret_cast<Packet4ui>(im_term), 8));
  im_term = pxor(im_term, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));

  return Packet1cd(padd<Packet2d>(re_term, im_term));
}
// Forwards to pand on the underlying Packet2d values.
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const Packet2d bits = pand(a.v, b.v);
  return Packet1cd(bits);
}
// Forwards to por on the underlying Packet2d values.
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const Packet2d bits = por(a.v, b.v);
  return Packet1cd(bits);
}
// Forwards to pxor on the underlying Packet2d values.
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const Packet2d bits = pxor(a.v, b.v);
  return Packet1cd(bits);
}
// Forwards to pandnot on the underlying Packet2d values.
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const Packet2d bits = pandnot(a.v, b.v);
  return Packet1cd(bits);
}
// With a single element per packet, "load and duplicate" is the same as
// building the packet from *from.
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
{
  const std::complex<double>& value = *from;
  return pset1<Packet1cd>(value);
}
// Issues a prefetch hint for `addr` through EIGEN_PPC_PREFETCH.
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)
{
  EIGEN_PPC_PREFETCH(addr);
}
// Returns the packet's only element by storing the packet to a 16-byte
// aligned scratch buffer and reading the first slot back.
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
  std::complex<double> EIGEN_ALIGN16 scratch[2];
  pstore<std::complex<double> >(scratch, a);

  return *scratch;
}
// Reversing a one-element packet is the identity.
template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a)
{
  return a;
}
// Sum over a one-element packet: simply its first (only) element.
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
{
  const std::complex<double> only = pfirst(a);
  return only;
}
// Packet-wise reduction over an array of one-element packets: returns the
// first packet unchanged.
template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
{
  return *vecs;
}
// Product over a one-element packet: simply its first (only) element.
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
{
  const std::complex<double> only = pfirst(a);
  return only;
}
// Alignment shift for one-element packets: intentionally a no-op.
template<int Offset>
struct palign_impl<Offset,Packet1cd>
{
  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
  {
    // FIXME is it sure we never have to align a Packet1cd?
    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
  }
};
/** Conjugate helper for the (false, true) case: products are formed as a * conj(b). */
template<> struct conj_helper<Packet1cd, Packet1cd, false, true>
{
  /** Returns a * conj(b). */
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    const Packet1cd b_conj = pconj(b);
    return internal::pmul(a, b_conj);
  }

  /** Returns x * conj(y) + c, using this helper's pmul for the product. */
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  {
    const Packet1cd product = this->pmul(x, y);
    return padd(product, c);
  }
};
/** Conjugate helper for the (true, false) case: products are formed as conj(a) * b. */
template<> struct conj_helper<Packet1cd, Packet1cd, true, false>
{
  /** Returns conj(a) * b. */
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    const Packet1cd a_conj = pconj(a);
    return internal::pmul(a_conj, b);
  }

  /** Returns conj(x) * y + c, using this helper's pmul for the product. */
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  {
    const Packet1cd product = this->pmul(x, y);
    return padd(product, c);
  }
};
/** Conjugate helper for the (true, true) case: products are formed as conj(a * b). */
template<> struct conj_helper<Packet1cd, Packet1cd, true, true>
{
  /** Returns conj(a * b). */
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    const Packet1cd plain_product = internal::pmul(a, b);
    return pconj(plain_product);
  }

  /** Returns conj(x * y) + c, using this helper's pmul for the product. */
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  {
    const Packet1cd product = this->pmul(x, y);
    return padd(product, c);
  }
};
// Macro invocation pairing the complex packet type (Packet1cd) with the real
// packet type (Packet2d). The macro is defined elsewhere; presumably it emits
// the conj_helper specializations for mixed complex/real products - TODO confirm.
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL
(
Packet1cd
,
Packet2d
)
/**
 * Complex division a / b, evaluated as (a * conj(b)) / |b|^2.
 * The squared modulus is built by squaring both lanes of b and adding the
 * result to a copy of itself with the two 64-bit halves swapped.
 */
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // TODO optimize it for AltiVec
  Packet1cd numerator = conj_helper<Packet1cd, Packet1cd, false, true>().pmul(a, b);
  Packet2d squares = pmul<Packet2d>(b.v, b.v);
  Packet2d swapped_squares = vec_perm(squares, squares, p16uc_REVERSE64);
  Packet2d denominator = padd<Packet2d>(squares, swapped_squares);
  return Packet1cd(pdiv(numerator.v, denominator));
}
/** Returns a Packet1cd whose underlying Packet2d is the preverse of x's. */
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
  const Packet2d lanes = Packet2d(x.v);
  return Packet1cd(preverse(lanes));
}
/**
 * In-place transpose of a block of two Packet1cd.
 * Both rows are read first, then packet[0] receives the permutation selected by
 * p16uc_TRANSPOSE64_HI and packet[1] the one selected by p16uc_TRANSPOSE64_LO.
 */
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel)
{
  const Packet2d row0 = kernel.packet[0].v;
  const Packet2d row1 = kernel.packet[1].v;
  kernel.packet[0].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_HI);
  kernel.packet[1].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_LO);
}
#endif // __VSX__
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_COMPLEX_ALTIVEC_H
#endif // EIGEN_COMPLEX
32
_ALTIVEC_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
namespace
Eigen
{
namespace
internal
{
static
_EIGEN_DECLARE_CONST_Packet4f
(
1
,
1.0
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
half
,
0.5
f
);
static
_EIGEN_DECLARE_CONST_Packet4i
(
0x7f
,
0x7f
);
static
_EIGEN_DECLARE_CONST_Packet4i
(
23
,
23
);
static
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT
(
inv_mant_mask
,
~
0x7f800000
);
/* the smallest non denormalized float number */
static
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT
(
min_norm_pos
,
0x00800000
);
static
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT
(
minus_inf
,
0xff800000
);
// -1.f/0.f
static
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT
(
minus_nan
,
0xffffffff
);
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_SQRTHF
,
0.707106781186547524
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p0
,
7.0376836292E-2
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p1
,
-
1.1514610310E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p2
,
1.1676998740E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p3
,
-
1.2420140846E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p4
,
+
1.4249322787E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p5
,
-
1.6668057665E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p6
,
+
2.0000714765E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p7
,
-
2.4999993993E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_p8
,
+
3.3333331174E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_q1
,
-
2.12194440e-4
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_log_q2
,
0.693359375
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
exp_hi
,
88.3762626647950
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
exp_lo
,
-
88.3762626647949
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_LOG2EF
,
1.44269504088896341
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_C1
,
0.693359375
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_C2
,
-
2.12194440e-4
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p0
,
1.9875691500E-4
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p1
,
1.3981999507E-3
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p2
,
8.3334519073E-3
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p3
,
4.1665795894E-2
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p4
,
1.6666665459E-1
f
);
static
_EIGEN_DECLARE_CONST_Packet4f
(
cephes_exp_p5
,
5.0000001201E-1
f
);
#ifdef __VSX__
static
_EIGEN_DECLARE_CONST_Packet2d
(
1
,
1.0
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
2
,
2.0
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
half
,
0.5
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
exp_hi
,
709.437
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
exp_lo
,
-
709.436139303
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_LOG2EF
,
1.4426950408889634073599
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_p0
,
1.26177193074810590878e-4
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_p1
,
3.02994407707441961300e-2
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_p2
,
9.99999999999999999910e-1
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_q0
,
3.00198505138664455042e-6
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_q1
,
2.52448340349684104192e-3
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_q2
,
2.27265548208155028766e-1
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_q3
,
2.00000000000000000009e0
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_C1
,
0.693145751953125
);
static
_EIGEN_DECLARE_CONST_Packet2d
(
cephes_exp_C2
,
1.42860682030941723212e-6
);
#ifdef __POWER8_VECTOR__
static
Packet2l
p2l_1023
=
{
1023
,
1023
};
static
Packet2ul
p2ul_52
=
{
52
,
52
};
#endif
#endif
/**
 * Natural logarithm of four floats at once (Cephes-style algorithm).
 *
 * The input is split into an exponent `e` and a reduced argument `m`, a
 * polynomial in `m` is evaluated, and the exponent contribution is added back
 * in two parts (cephes_log_q1 / cephes_log_q2).
 * Lanes equal to zero yield -INF; lanes that fail `x >= 0` yield NaN.
 */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
  /* isvalid_mask is 0 if x < 0 or x is NaN. */
  const Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(_x, p4f_ZERO));
  const Packet4ui iszero_mask  = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, p4f_ZERO));

  /* cut off denormalized stuff */
  const Packet4f clamped = pmax(_x, p4f_min_norm_pos);

  // Raw exponent bits: shift the float's bit pattern right by 23.
  const Packet4i raw_exponent = vec_sr(reinterpret_cast<Packet4i>(clamped),
                                       reinterpret_cast<Packet4ui>(p4i_23));

  /* keep only the fractional part */
  Packet4f m = por(pand(clamped, p4f_inv_mant_mask), p4f_half);

  Packet4f e = padd(vec_ctf(psub(raw_exponent, p4i_0x7f), 0), p4f_1);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  const Packet4f below_sqrthf = reinterpret_cast<Packet4f>(vec_cmplt(m, p4f_cephes_SQRTHF));
  const Packet4f m_where_below = pand(m, below_sqrthf);
  m = psub(m, p4f_1);
  e = psub(e, pand(p4f_1, below_sqrthf));
  m = padd(m, m_where_below);

  const Packet4f m2 = pmul(m, m);
  const Packet4f m3 = pmul(m2, m);

  // Three independent degree-2 partial polynomials, combined below with m^3.
  Packet4f part_a = pmadd(p4f_cephes_log_p0, m, p4f_cephes_log_p1);
  part_a = pmadd(part_a, m, p4f_cephes_log_p2);
  Packet4f part_b = pmadd(p4f_cephes_log_p3, m, p4f_cephes_log_p4);
  part_b = pmadd(part_b, m, p4f_cephes_log_p5);
  Packet4f part_c = pmadd(p4f_cephes_log_p6, m, p4f_cephes_log_p7);
  part_c = pmadd(part_c, m, p4f_cephes_log_p8);

  Packet4f y = pmadd(part_a, m3, part_b);
  y = pmadd(y, m3, part_c);
  y = pmul(y, m3);

  // Add the exponent contribution (e * q1 and e * q2) and the -m^2/2 term.
  y = padd(y, pmul(e, p4f_cephes_log_q1));
  m = psub(m, pmul(m2, p4f_half));
  m = padd(m, y);
  m = padd(m, pmul(e, p4f_cephes_log_q2));

  // negative arg will be NAN, 0 will be -INF
  const Packet4f zero_handled = vec_sel(m, p4f_minus_inf, iszero_mask);
  return vec_sel(p4f_minus_nan, zero_handled, isvalid_mask);
}
/**
 * Exponential of four floats at once (Cephes-style algorithm).
 *
 * The argument is clamped to [exp_lo, exp_hi], written as g + n*log(2), a
 * polynomial in g is evaluated, and the result is scaled by 2^n built directly
 * in the float exponent field. Lanes for which `_x == _x` is false are
 * returned unmodified.
 */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
  // clamp x
  const Packet4f clamped = pmax(pmin(_x, p4f_exp_hi), p4f_exp_lo);

  // express exp(x) as exp(g + n*log(2))
  const Packet4f n = pfloor(pmadd(clamped, p4f_cephes_LOG2EF, p4f_half));

  // g = x - n*C1 - n*C2 (log(2) is subtracted in two pieces).
  const Packet4f n_times_c1 = pmul(n, p4f_cephes_exp_C1);
  const Packet4f n_times_c2 = pmul(n, p4f_cephes_exp_C2);
  Packet4f g = psub(clamped, n_times_c1);
  g = psub(g, n_times_c2);

  const Packet4f g2 = pmul(g, g);

  // Horner evaluation of the polynomial in g.
  Packet4f poly = p4f_cephes_exp_p0;
  poly = pmadd(poly, g, p4f_cephes_exp_p1);
  poly = pmadd(poly, g, p4f_cephes_exp_p2);
  poly = pmadd(poly, g, p4f_cephes_exp_p3);
  poly = pmadd(poly, g, p4f_cephes_exp_p4);
  poly = pmadd(poly, g, p4f_cephes_exp_p5);
  poly = pmadd(poly, g2, g);
  poly = padd(poly, p4f_1);

  // build 2^n
  Packet4i pow2n_bits = vec_cts(n, 0);
  pow2n_bits = vec_add(pow2n_bits, p4i_0x7f);
  pow2n_bits = vec_sl(pow2n_bits, reinterpret_cast<Packet4ui>(p4i_23));

  // Altivec's max & min operators just drop silent NaNs.  Check NaNs in
  // inputs and return them unmodified.
  const Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
  const Packet4f scaled = pmul(poly, reinterpret_cast<Packet4f>(pow2n_bits));
  return vec_sel(_x, pmax(scaled, _x), isnumber_mask);
}
// NOTE(review): this guard tests whether EIGEN_COMP_CLANG is *defined*, while
// packet_traits<float> in PacketMath.h tests its *value* (#if !EIGEN_COMP_CLANG).
// If the macro is always defined (as 0 on non-clang compilers), this
// specialization is never compiled - confirm which guard is intended.
#ifndef EIGEN_COMP_CLANG
/** Reciprocal square root of a float packet via the vec_rsqrt intrinsic. */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& x)
{
  const Packet4f result = vec_rsqrt(x);
  return result;
}
#endif
#ifdef __VSX__
// NOTE(review): this guard tests whether EIGEN_COMP_CLANG is *defined*; if the
// macro is always defined (as 0 on non-clang compilers), this specialization
// is never compiled - confirm whether `#if !EIGEN_COMP_CLANG` was intended.
#ifndef EIGEN_COMP_CLANG
/** Reciprocal square root of a double packet via the vec_rsqrt intrinsic. */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d prsqrt<Packet2d>(const Packet2d& x)
{
  const Packet2d result = vec_rsqrt(x);
  return result;
}
#endif
/** Square root of a float packet via the vec_sqrt intrinsic. */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& x)
{
  const Packet4f result = vec_sqrt(x);
  return result;
}
/** Square root of a double packet via the vec_sqrt intrinsic. */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d psqrt<Packet2d>(const Packet2d& x)
{
  const Packet2d result = vec_sqrt(x);
  return result;
}
// Converts a Packet2d to a Packet2l.
//
// VSX support varies between different compilers and even different
// versions of the same compiler.  When the GCC version check below passes
// (EIGEN_GNUC_AT_LEAST(5, 4), or 6.1 with patch level >= 1) vec_cts is used
// to convert efficiently.  Otherwise a slow scalar version that works with
// older compilers is used: both lanes are copied out and cast one by one.
// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
#if EIGEN_GNUC_AT_LEAST(5, 4) || \
    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
  return vec_cts(x, 0);    // TODO: check clang version.
#else
  double lanes[2];
  memcpy(lanes, &x, sizeof(lanes));
  const long long lane0 = static_cast<long long>(lanes[0]);
  const long long lane1 = static_cast<long long>(lanes[1]);
  Packet2l converted = { lane0, lane1 };
  return converted;
#endif
}
/**
 * Exponential of two doubles at once (Cephes-style algorithm).
 *
 * The argument is clamped to [exp_lo, exp_hi], written as g + n*log(2), and
 * exp(g) is approximated as 1 + 2*px/(qx - px) with px, qx polynomials in g.
 * The result is scaled by 2^n built directly in the double exponent field.
 * Lanes for which `_x == _x` is false are returned unmodified.
 */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
{
  // clamp x
  const Packet2d clamped = pmax(pmin(_x, p2d_exp_hi), p2d_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  const Packet2d n = pfloor(pmadd(clamped, p2d_cephes_LOG2EF, p2d_half));

  // g = x - n*C1 - n*C2 (log(2) is subtracted in two pieces).
  const Packet2d n_times_c1 = pmul(n, p2d_cephes_exp_C1);
  const Packet2d n_times_c2 = pmul(n, p2d_cephes_exp_C2);
  Packet2d g = psub(clamped, n_times_c1);
  g = psub(g, n_times_c2);

  const Packet2d g2 = pmul(g, g);

  // Numerator polynomial: g * P(g^2).
  Packet2d px = p2d_cephes_exp_p0;
  px = pmadd(px, g2, p2d_cephes_exp_p1);
  px = pmadd(px, g2, p2d_cephes_exp_p2);
  px = pmul (px, g);

  // Denominator polynomial: Q(g^2).
  Packet2d qx = p2d_cephes_exp_q0;
  qx = pmadd(qx, g2, p2d_cephes_exp_q1);
  qx = pmadd(qx, g2, p2d_cephes_exp_q2);
  qx = pmadd(qx, g2, p2d_cephes_exp_q3);

  const Packet2d ratio = pdiv(px, psub(qx, px));
  const Packet2d exp_g = pmadd(p2d_2, ratio, p2d_1);

  // build 2^n
  Packet2l pow2n_bits = ConvertToPacket2l(n);

#ifdef __POWER8_VECTOR__
  pow2n_bits = vec_add(pow2n_bits, p2l_1023);
  pow2n_bits = vec_sl(pow2n_bits, p2ul_52);
#else
  // Code is a bit complex for POWER7.  There is actually a
  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
  // So we shift (52-32) bits and do a word swap with zeros.
  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32

  Packet4i words = reinterpret_cast<Packet4i>(pow2n_bits);
  words = vec_add(words, p4i_1023);
  words = vec_sl(words, reinterpret_cast<Packet4ui>(p4i_20));
  static const Packet16uc perm = {
    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
#ifdef  _BIG_ENDIAN
  pow2n_bits = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, words, perm));
#else
  pow2n_bits = reinterpret_cast<Packet2l>(vec_perm(words, p4i_ZERO, perm));
#endif

#endif

  // Altivec's max & min operators just drop silent NaNs.  Check NaNs in
  // inputs and return them unmodified.
  const Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
  const Packet2d scaled = pmul(exp_g, reinterpret_cast<Packet2d>(pow2n_bits));
  return vec_sel(_x, pmax(scaled, _x), isnumber_mask);
}
#endif
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Konstantinos Margaritis <markos@
codex.gr
>
// Copyright (C) 2008
-2016
Konstantinos Margaritis <markos@
freevec.org
>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
...
...
@@ -18,13 +18,17 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif
#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
16
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
32
#endif
typedef
__vector
float
Packet4f
;
...
...
@@ -38,7 +42,7 @@ typedef __vector unsigned char Packet16uc;
// and it doesn't really work to declare them global, so we define macros instead
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
Packet4f p4f_##NAME =
(
Packet4f
)
vec_splat_s32(X)
Packet4f p4f_##NAME =
reinterpret_cast<
Packet4f
>(
vec_splat_s32(X)
)
#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
Packet4i p4i_##NAME = vec_splat_s32(X)
...
...
@@ -46,60 +50,158 @@ typedef __vector unsigned char Packet16uc;
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
Packet4i p4i_##NAME = pset1<Packet4i>(X)
#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
Packet2d p2d_##NAME = pset1<Packet2d>(X)
#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
Packet2l p2l_##NAME = pset1<Packet2l>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
// These constants are endian-agnostic
static
_EIGEN_DECLARE_CONST_FAST_Packet4f
(
ZERO
,
0
);
//{ 0.0, 0.0, 0.0, 0.0}
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
ZERO
,
0
);
//{ 0, 0, 0, 0,}
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
ONE
,
1
);
//{ 1, 1, 1, 1}
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
MINUS16
,
-
16
);
//{ -16, -16, -16, -16}
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
MINUS1
,
-
1
);
//{ -1, -1, -1, -1}
static
Packet4f
p4f_MZERO
=
(
Packet4f
)
vec_sl
((
Packet4ui
)
p4i_MINUS1
,
(
Packet4ui
)
p4i_MINUS1
);
//{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
#ifndef __VSX__
static
Packet4f
p4f_ONE
=
vec_ctf
(
p4i_ONE
,
0
);
//{ 1.0, 1.0, 1.0, 1.0}
#endif
static
Packet4f
p4f_COUNTDOWN
=
{
0.0
,
1.0
,
2.0
,
3.0
};
static
Packet4i
p4i_COUNTDOWN
=
{
0
,
1
,
2
,
3
};
static
Packet16uc
p16uc_REVERSE32
=
{
12
,
13
,
14
,
15
,
8
,
9
,
10
,
11
,
4
,
5
,
6
,
7
,
0
,
1
,
2
,
3
};
static
Packet16uc
p16uc_DUPLICATE32_HI
=
{
0
,
1
,
2
,
3
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
4
,
5
,
6
,
7
};
// Mask alignment
#ifdef __PPC64__
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
#else
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
#endif
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
// Handle endianness properly while loading constants
// Define global static constants:
static
Packet4f
p4f_COUNTDOWN
=
{
3.0
,
2.0
,
1.0
,
0.0
};
static
Packet4i
p4i_COUNTDOWN
=
{
3
,
2
,
1
,
0
};
static
Packet16uc
p16uc_REVERSE
=
{
12
,
13
,
14
,
15
,
8
,
9
,
10
,
11
,
4
,
5
,
6
,
7
,
0
,
1
,
2
,
3
};
#ifdef _BIG_ENDIAN
static
Packet16uc
p16uc_FORWARD
=
vec_lvsl
(
0
,
(
float
*
)
0
);
static
Packet16uc
p16uc_DUPLICATE
=
{
0
,
1
,
2
,
3
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
4
,
5
,
6
,
7
};
static
_EIGEN_DECLARE_CONST_FAST_Packet4f
(
ZERO
,
0
);
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
ZERO
,
0
);
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
ONE
,
1
);
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
MINUS16
,
-
16
);
static
_EIGEN_DECLARE_CONST_FAST_Packet4i
(
MINUS1
,
-
1
);
static
Packet4f
p4f_ONE
=
vec_ctf
(
p4i_ONE
,
0
);
static
Packet4f
p4f_ZERO_
=
(
Packet4f
)
vec_sl
((
Packet4ui
)
p4i_MINUS1
,
(
Packet4ui
)
p4i_MINUS1
);
#ifdef __VSX__
static
Packet16uc
p16uc_REVERSE64
=
{
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
};
#endif
static
Packet16uc
p16uc_PSET32_WODD
=
vec_sld
((
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
0
),
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
2
),
8
);
//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static
Packet16uc
p16uc_PSET32_WEVEN
=
vec_sld
(
p16uc_DUPLICATE32_HI
,
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
3
),
8
);
//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static
Packet16uc
p16uc_HALF64_0_16
=
vec_sld
((
Packet16uc
)
p4i_ZERO
,
vec_splat
((
Packet16uc
)
vec_abs
(
p4i_MINUS16
),
3
),
8
);
//{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static
Packet16uc
p16uc_FORWARD
=
p16uc_REVERSE32
;
static
Packet16uc
p16uc_REVERSE64
=
{
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
};
static
Packet16uc
p16uc_PSET32_WODD
=
vec_sld
((
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
1
),
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
3
),
8
);
//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static
Packet16uc
p16uc_PSET32_WEVEN
=
vec_sld
((
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
0
),
(
Packet16uc
)
vec_splat
((
Packet4ui
)
p16uc_FORWARD
,
2
),
8
);
//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static
Packet16uc
p16uc_HALF64_0_16
=
vec_sld
(
vec_splat
((
Packet16uc
)
vec_abs
(
p4i_MINUS16
),
0
),
(
Packet16uc
)
p4i_ZERO
,
8
);
//{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#endif // _BIG_ENDIAN
static
Packet16uc
p16uc_PSET64_HI
=
(
Packet16uc
)
vec_mergeh
((
Packet4ui
)
p16uc_PSET32_WODD
,
(
Packet4ui
)
p16uc_PSET32_WEVEN
);
//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static
Packet16uc
p16uc_PSET64_LO
=
(
Packet16uc
)
vec_mergel
((
Packet4ui
)
p16uc_PSET32_WODD
,
(
Packet4ui
)
p16uc_PSET32_WEVEN
);
//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static
Packet16uc
p16uc_TRANSPOSE64_HI
=
p16uc_PSET64_HI
+
p16uc_HALF64_0_16
;
//{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static
Packet16uc
p16uc_TRANSPOSE64_LO
=
p16uc_PSET64_LO
+
p16uc_HALF64_0_16
;
//{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
static
Packet16uc
p16uc_COMPLEX32_REV
=
vec_sld
(
p16uc_REVERSE32
,
p16uc_REVERSE32
,
8
);
//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
#ifdef _BIG_ENDIAN
static
Packet16uc
p16uc_COMPLEX32_REV2
=
vec_sld
(
p16uc_FORWARD
,
p16uc_FORWARD
,
8
);
//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#else
static
Packet16uc
p16uc_COMPLEX32_REV2
=
vec_sld
(
p16uc_PSET64_HI
,
p16uc_PSET64_LO
,
8
);
//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#endif // _BIG_ENDIAN
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
#define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#endif
template
<
>
struct
packet_traits
<
float
>
:
default_packet_traits
{
typedef
Packet4f
type
;
typedef
Packet4f
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
4
,
// FIXME check the Has*
HasHalfPacket
=
1
,
HasAdd
=
1
,
HasSub
=
1
,
HasMul
=
1
,
HasDiv
=
1
,
HasMin
=
1
,
HasMax
=
1
,
HasAbs
=
1
,
HasSin
=
0
,
HasCos
=
0
,
HasLog
=
0
,
HasExp
=
0
,
HasSqrt
=
0
HasExp
=
1
,
#ifdef __VSX__
HasSqrt
=
1
,
#if !EIGEN_COMP_CLANG
HasRsqrt
=
1
,
#else
HasRsqrt
=
0
,
#endif
#else
HasSqrt
=
0
,
HasRsqrt
=
0
,
#endif
HasRound
=
1
,
HasFloor
=
1
,
HasCeil
=
1
,
HasNegate
=
1
,
HasBlend
=
1
};
};
template
<
>
struct
packet_traits
<
int
>
:
default_packet_traits
{
typedef
Packet4i
type
;
typedef
Packet4i
half
;
enum
{
// FIXME check the Has*
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
4
size
=
4
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
HasMul
=
1
,
HasDiv
=
0
,
HasBlend
=
1
};
};
template
<
>
struct
unpacket_traits
<
Packet4f
>
{
typedef
float
type
;
enum
{
size
=
4
};
};
template
<
>
struct
unpacket_traits
<
Packet4i
>
{
typedef
int
type
;
enum
{
size
=
4
};
};
/*
template
<
>
struct
unpacket_traits
<
Packet4f
>
{
typedef
float
type
;
enum
{
size
=
4
,
alignment
=
Aligned16
};
typedef
Packet4f
half
;
};
template
<
>
struct
unpacket_traits
<
Packet4i
>
{
typedef
int
type
;
enum
{
size
=
4
,
alignment
=
Aligned16
};
typedef
Packet4i
half
;
};
inline
std
::
ostream
&
operator
<<
(
std
::
ostream
&
s
,
const
Packet16uc
&
v
)
{
union
{
Packet16uc
v
;
unsigned
char
n
[
16
];
}
vt
;
vt
.
v
=
v
;
for
(
int
i
=
0
;
i
<
16
;
i
++
)
s
<<
(
int
)
vt
.
n
[
i
]
<<
", "
;
return
s
;
}
inline
std
::
ostream
&
operator
<<
(
std
::
ostream
&
s
,
const
Packet4f
&
v
)
{
union
{
...
...
@@ -133,86 +235,136 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
return
s
;
}
inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
// Need to define them first or we get specialization after instantiation errors
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pload
<
Packet4f
>
(
const
float
*
from
)
{
union {
Packet4bi v;
unsigned int n[4];
} vt;
vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s;
}
*/
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pset1
<
Packet4f
>
(
const
float
&
from
)
{
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
float
EIGEN_ALIGN16
af
[
4
];
af
[
0
]
=
from
;
Packet4f
vc
=
vec_ld
(
0
,
af
);
vc
=
vec_splat
(
vc
,
0
);
return
vc
;
EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
return
vec_vsx_ld
(
0
,
from
);
#else
return
vec_ld
(
0
,
from
);
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pset1
<
Packet4i
>
(
const
int
&
from
)
{
int
EIGEN_ALIGN16
ai
[
4
];
ai
[
0
]
=
from
;
Packet4i
vc
=
vec_ld
(
0
,
ai
);
vc
=
vec_splat
(
vc
,
0
);
return
vc
;
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pload
<
Packet4i
>
(
const
int
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
return
vec_vsx_ld
(
0
,
from
);
#else
return
vec_ld
(
0
,
from
);
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
plset
<
float
>
(
const
float
&
a
)
{
return
vec_add
(
pset1
<
Packet4f
>
(
a
),
p4f_COUNTDOWN
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
plset
<
int
>
(
const
int
&
a
)
{
return
vec_add
(
pset1
<
Packet4i
>
(
a
),
p4i_COUNTDOWN
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
vec_vsx_st
(
from
,
0
,
to
);
#else
vec_st
(
from
,
0
,
to
);
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
padd
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
vec_add
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
padd
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
vec_add
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
int
>
(
int
*
to
,
const
Packet4i
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
vec_vsx_st
(
from
,
0
,
to
);
#else
vec_st
(
from
,
0
,
to
);
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
psub
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
vec_sub
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
psub
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
vec_sub
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pset1
<
Packet4f
>
(
const
float
&
from
)
{
Packet4f
v
=
{
from
,
from
,
from
,
from
};
return
v
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pnegate
(
const
Packet4f
&
a
)
{
return
psub
<
Packet4f
>
(
p4f_ZERO
,
a
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pnegate
(
const
Packet4i
&
a
)
{
return
psub
<
Packet4i
>
(
p4i_ZERO
,
a
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pset1
<
Packet4i
>
(
const
int
&
from
)
{
Packet4i
v
=
{
from
,
from
,
from
,
from
};
return
v
;
}
template
<
>
EIGEN_STRONG_INLINE
void
pbroadcast4
<
Packet4f
>
(
const
float
*
a
,
Packet4f
&
a0
,
Packet4f
&
a1
,
Packet4f
&
a2
,
Packet4f
&
a3
)
{
a3
=
pload
<
Packet4f
>
(
a
);
a0
=
vec_splat
(
a3
,
0
);
a1
=
vec_splat
(
a3
,
1
);
a2
=
vec_splat
(
a3
,
2
);
a3
=
vec_splat
(
a3
,
3
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pbroadcast4
<
Packet4i
>
(
const
int
*
a
,
Packet4i
&
a0
,
Packet4i
&
a1
,
Packet4i
&
a2
,
Packet4i
&
a3
)
{
a3
=
pload
<
Packet4i
>
(
a
);
a0
=
vec_splat
(
a3
,
0
);
a1
=
vec_splat
(
a3
,
1
);
a2
=
vec_splat
(
a3
,
2
);
a3
=
vec_splat
(
a3
,
3
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmul
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
vec_madd
(
a
,
b
,
p4f_ZERO
);
}
/* Commented out: it's actually slower than processing it scalar
*
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
template
<
>
EIGEN_DEVICE_FUNC
inline
Packet4f
pgather
<
float
,
Packet4f
>
(
const
float
*
from
,
Index
stride
)
{
float
EIGEN_ALIGN16
af
[
4
];
af
[
0
]
=
from
[
0
*
stride
];
af
[
1
]
=
from
[
1
*
stride
];
af
[
2
]
=
from
[
2
*
stride
];
af
[
3
]
=
from
[
3
*
stride
];
return
pload
<
Packet4f
>
(
af
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
Packet4i
pgather
<
int
,
Packet4i
>
(
const
int
*
from
,
Index
stride
)
{
// Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
//Set up constants, variables
Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
int
EIGEN_ALIGN16
ai
[
4
];
ai
[
0
]
=
from
[
0
*
stride
];
ai
[
1
]
=
from
[
1
*
stride
];
ai
[
2
]
=
from
[
2
*
stride
];
ai
[
3
]
=
from
[
3
*
stride
];
return
pload
<
Packet4i
>
(
ai
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
void
pscatter
<
float
,
Packet4f
>
(
float
*
to
,
const
Packet4f
&
from
,
Index
stride
)
{
float
EIGEN_ALIGN16
af
[
4
];
pstore
<
float
>
(
af
,
from
);
to
[
0
*
stride
]
=
af
[
0
];
to
[
1
*
stride
]
=
af
[
1
];
to
[
2
*
stride
]
=
af
[
2
];
to
[
3
*
stride
]
=
af
[
3
];
}
template
<
>
EIGEN_DEVICE_FUNC
inline
void
pscatter
<
int
,
Packet4i
>
(
int
*
to
,
const
Packet4i
&
from
,
Index
stride
)
{
int
EIGEN_ALIGN16
ai
[
4
];
pstore
<
int
>
((
int
*
)
ai
,
from
);
to
[
0
*
stride
]
=
ai
[
0
];
to
[
1
*
stride
]
=
ai
[
1
];
to
[
2
*
stride
]
=
ai
[
2
];
to
[
3
*
stride
]
=
ai
[
3
];
}
// Get the absolute values
a1 = vec_abs(a);
b1 = vec_abs(b);
template
<
>
EIGEN_STRONG_INLINE
Packet4f
plset
<
Packet4f
>
(
const
float
&
a
)
{
return
pset1
<
Packet4f
>
(
a
)
+
p4f_COUNTDOWN
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
plset
<
Packet4i
>
(
const
int
&
a
)
{
return
pset1
<
Packet4i
>
(
a
)
+
p4i_COUNTDOWN
;
}
// Get the signs using xor
Packet4
b
i
sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
template
<
>
EIGEN_STRONG_INLINE
Packet4f
padd
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
a
+
b
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
padd
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
a
+
b
;
}
// Do the multiplication for the asbolute values.
bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
prod = vec_add( low_prod, high_prod );
template
<
>
EIGEN_STRONG_INLINE
Packet4f
psub
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
a
-
b
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
psub
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
a
-
b
;
}
// NOR the product and select only the negative elements according to the sign mask
prod_ = vec_nor(prod, prod);
prod_ = vec_sel(p4i_ZERO, prod_, sgn);
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pnegate
(
const
Packet4f
&
a
)
{
return
p4f_ZERO
-
a
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pnegate
(
const
Packet4i
&
a
)
{
return
p4i_ZERO
-
a
;
}
// Add 1 to the result to get the negative numbers
v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
prod_ = vec_add(prod_, v1sel);
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pconj
(
const
Packet4f
&
a
)
{
return
a
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pconj
(
const
Packet4i
&
a
)
{
return
a
;
}
// Merge the results back to the final vector.
prod = vec_sel(prod, prod_, sgn);
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmul
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
vec_madd
(
a
,
b
,
p4f_MZERO
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmul
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
a
*
b
;
}
return prod;
}
*/
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pdiv
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
Packet4f
t
,
y_0
,
y_1
,
res
;
#ifndef __VSX__ // VSX actually provides a div instruction
Packet4f
t
,
y_0
,
y_1
;
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
y_0
=
vec_re
(
b
);
...
...
@@ -221,8 +373,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
t
=
vec_nmsub
(
y_0
,
b
,
p4f_ONE
);
y_1
=
vec_madd
(
y_0
,
t
,
y_0
);
res
=
vec_madd
(
a
,
y_1
,
p4f_ZERO
);
return
res
;
return
vec_madd
(
a
,
y_1
,
p4f_MZERO
);
#else
return
vec_div
(
a
,
b
);
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pdiv
<
Packet4i
>
(
const
Packet4i
&
/*a*/
,
const
Packet4i
&
/*b*/
)
...
...
@@ -231,16 +385,33 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
}
// for some weird reasons, it has to be overloaded for packets of integers
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmadd
(
const
Packet4f
&
a
,
const
Packet4f
&
b
,
const
Packet4f
&
c
)
{
return
vec_madd
(
a
,
b
,
c
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmadd
(
const
Packet4i
&
a
,
const
Packet4i
&
b
,
const
Packet4i
&
c
)
{
return
padd
(
pmul
(
a
,
b
),
c
)
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmadd
(
const
Packet4f
&
a
,
const
Packet4f
&
b
,
const
Packet4f
&
c
)
{
return
vec_madd
(
a
,
b
,
c
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmadd
(
const
Packet4i
&
a
,
const
Packet4i
&
b
,
const
Packet4i
&
c
)
{
return
a
*
b
+
c
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmin
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
vec_min
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmin
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
#ifdef __VSX__
Packet4f
ret
;
__asm__
(
"xvcmpgesp %x0,%x1,%x2
\n\t
xxsel %x0,%x1,%x2,%x0"
:
"=&wa"
(
ret
)
:
"wa"
(
a
),
"wa"
(
b
));
return
ret
;
#else
return
vec_min
(
a
,
b
);
#endif
}
// Lane-wise minimum of two Packet4i, via vec_min.
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
  const Packet4i smallest = vec_min(a, b);
  return smallest;
}
// Lane-wise maximum of two Packet4f, via vec_max.
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  const Packet4f largest = vec_max(a, b);
  return largest;
}
// Lane-wise maximum of two Packet4f.
// Without VSX this is vec_max; with VSX a compare (xvcmpgtsp, operands
// swapped) followed by a select (xxsel) is emitted through inline assembly.
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifndef __VSX__
  return vec_max(a, b);
#else
  Packet4f selected;
  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (selected) : "wa" (a), "wa" (b));
  return selected;
#endif
}
// Lane-wise maximum of two Packet4i, via vec_max.
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
  const Packet4i largest = vec_max(a, b);
  return largest;
}
// Bitwise logical operations. The float overloads below hand Packet4f operands directly to vec_and/vec_nor (no reinterpret casts and no NEON intrinsics are involved in this AltiVec code).
// Bitwise AND of two Packet4f, via vec_and.
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  const Packet4f bits = vec_and(a, b);
  return bits;
}
// Bitwise AND of two Packet4i, via vec_and.
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b)
{
  const Packet4i bits = vec_and(a, b);
  return bits;
}
...
...
@@ -253,13 +424,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
// Computes a AND (NOT b) on Packet4f; NOT b is formed as vec_nor(b, b).
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  const Packet4f not_b = vec_nor(b, b);
  return vec_and(a, not_b);
}
// Computes a AND (NOT b) on Packet4i; NOT b is formed as vec_nor(b, b).
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
{
  const Packet4i not_b = vec_nor(b, b);
  return vec_and(a, not_b);
}
// Aligned-load entry point for Packet4f: records the debug hook, then reads
// one vector from `from` with vec_ld at offset 0.
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const Packet4f loaded = vec_ld(0, from);
  return loaded;
}
// Aligned-load entry point for Packet4i: records the debug hook, then reads
// one vector from `from` with vec_ld at offset 0.
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  const Packet4i loaded = vec_ld(0, from);
  return loaded;
}
// Lane-wise rounding of a Packet4f, delegated to vec_round.
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
{
  const Packet4f rounded = vec_round(a);
  return rounded;
}
// Lane-wise ceiling of a Packet4f, delegated to vec_ceil.
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
{
  const Packet4f raised = vec_ceil(a);
  return raised;
}
// Lane-wise floor of a Packet4f, delegated to vec_floor.
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
{
  const Packet4f lowered = vec_floor(a);
  return lowered;
}
#ifdef _BIG_ENDIAN
template
<
>
EIGEN_STRONG_INLINE
Packet4f
ploadu
<
Packet4f
>
(
const
float
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
Packet16uc
MSQ
,
LSQ
;
Packet16uc
mask
;
MSQ
=
vec_ld
(
0
,
(
unsigned
char
*
)
from
);
// most significant quadword
...
...
@@ -279,25 +451,36 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
mask
=
vec_lvsl
(
0
,
from
);
// create the permute mask
return
(
Packet4i
)
vec_perm
(
MSQ
,
LSQ
,
mask
);
// align the data
}
#else
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
// Unaligned load of a Packet4i (little-endian build).
// The address is split into a base pointer produced by _EIGEN_ALIGNED_PTR and
// the low four address bits, which are passed to vec_vsx_ld as the offset.
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const long byte_offset = (long)from & 15;
  const int* base = (const int*) _EIGEN_ALIGNED_PTR(from);
  return (Packet4i) vec_vsx_ld(byte_offset, base);
}
// Unaligned load of a Packet4f (little-endian build).
// The address is split into a base pointer produced by _EIGEN_ALIGNED_PTR and
// the low four address bits, which are passed to vec_vsx_ld as the offset.
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  const long byte_offset = (long)from & 15;
  const float* base = (const float*) _EIGEN_ALIGNED_PTR(from);
  return (Packet4f) vec_vsx_ld(byte_offset, base);
}
#endif
// Loads a Packet4f from `from` and permutes it with p16uc_DUPLICATE32_HI.
// The aligned load is used when the pointer VALUE is a multiple of 16,
// otherwise the unaligned load. (The stale check on `&from`, which tested the
// address of the parameter itself, and the stale first return statement that
// made the updated one unreachable have been dropped.)
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  Packet4f p;
  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
  else                                  p = ploadu<Packet4f>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
// Loads a Packet4i from `from` and permutes it with p16uc_DUPLICATE32_HI.
// The aligned load is used when the pointer VALUE is a multiple of 16,
// otherwise the unaligned load. (The stale check on `&from`, which tested the
// address of the parameter itself, and the stale first return statement that
// made the updated one unreachable have been dropped.)
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i p;
  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
  else                                  p = ploadu<Packet4i>(from);
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
// Aligned store of a Packet4f: records the debug hook, then writes `from`
// to `to` with vec_st at offset 0.
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_st(from, 0, to);
}
// Aligned store of a Packet4i: records the debug hook, then writes `from`
// to `to` with vec_st at offset 0.
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  vec_st(from, 0, to);
}
#ifdef _BIG_ENDIAN
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
...
...
@@ -334,15 +517,33 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
vec_st
(
LSQ
,
15
,
(
unsigned
char
*
)
to
);
// Store the LSQ part first
vec_st
(
MSQ
,
0
,
(
unsigned
char
*
)
to
);
// Store the MSQ part
}
#else
// We also need to redefine little endian storing of Packet4i/Packet4f using VSX
// Unaligned store of a Packet4i (little-endian build).
// The destination is split into a base pointer produced by _EIGEN_ALIGNED_PTR
// and the low four address bits, which are passed to vec_vsx_st as the offset.
// Uses the UNALIGNED debug hook, matching the big-endian pstoreu and ploadu
// (the original tagged this store with EIGEN_DEBUG_ALIGNED_STORE).
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
}
// Unaligned store of a Packet4f (little-endian build).
// The destination is split into a base pointer produced by _EIGEN_ALIGNED_PTR
// and the low four address bits, which are passed to vec_vsx_st as the offset.
// Uses the UNALIGNED debug hook, matching the big-endian pstoreu and ploadu
// (the original tagged this store with EIGEN_DEBUG_ALIGNED_STORE).
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
#endif
// Prefetch hint for float data: issues vec_dstt on `addr` with
// DST_CTRL(2,2,32) on channel DST_CHAN.
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)
{
  vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN);
}
// Prefetch hint for int data: issues vec_dstt on `addr` with
// DST_CTRL(2,2,32) on channel DST_CHAN.
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr)
{
  vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN);
}
// Prefetch hint for float data, delegated to the EIGEN_PPC_PREFETCH macro.
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)
{
  EIGEN_PPC_PREFETCH(addr);
}
// Prefetch hint for int data, delegated to the EIGEN_PPC_PREFETCH macro.
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr)
{
  EIGEN_PPC_PREFETCH(addr);
}
// Returns element 0 of a Packet4f by storing the whole vector into an
// aligned 4-float scratch array and reading its first entry.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a)
{
  float EIGEN_ALIGN16 lanes[4];
  vec_st(a, 0, lanes);
  return lanes[0];
}
// Returns element 0 of a Packet4i by storing the whole vector into an
// aligned 4-int scratch array and reading its first entry.
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a)
{
  int EIGEN_ALIGN16 lanes[4];
  vec_st(a, 0, lanes);
  return lanes[0];
}
// Returns the first element of a Packet4f: vec_ste writes a single element
// into an aligned scalar, which is then returned.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a)
{
  float EIGEN_ALIGN16 first_lane;
  vec_ste(a, 0, &first_lane);
  return first_lane;
}
// Returns the first element of a Packet4i: vec_ste writes a single element
// into an aligned scalar, which is then returned.
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a)
{
  int EIGEN_ALIGN16 first_lane;
  vec_ste(a, 0, &first_lane);
  return first_lane;
}
// Reverses a Packet4f by permuting its bytes with the p16uc_REVERSE mask.
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
  const Packet16uc bytes = (Packet16uc)a;
  return (Packet4f)vec_perm(bytes, bytes, p16uc_REVERSE);
}
// Reverses a Packet4i by permuting its bytes with the p16uc_REVERSE mask.
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{
  const Packet16uc bytes = (Packet16uc)a;
  return (Packet4i)vec_perm(bytes, bytes, p16uc_REVERSE);
}
// Reverses a Packet4f by permuting its bytes with the p16uc_REVERSE32 mask.
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
  const Packet16uc bytes = reinterpret_cast<Packet16uc>(a);
  return reinterpret_cast<Packet4f>(vec_perm(bytes, bytes, p16uc_REVERSE32));
}
// Reverses a Packet4i by permuting its bytes with the p16uc_REVERSE32 mask.
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{
  const Packet16uc bytes = reinterpret_cast<Packet16uc>(a);
  return reinterpret_cast<Packet4i>(vec_perm(bytes, bytes, p16uc_REVERSE32));
}
// Lane-wise absolute value of a Packet4f, delegated to vec_abs.
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
  const Packet4f magnitude = vec_abs(a);
  return magnitude;
}
// Lane-wise absolute value of a Packet4i, delegated to vec_abs.
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
  const Packet4i magnitude = vec_abs(a);
  return magnitude;
}
...
...
@@ -350,10 +551,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs
// Horizontal sum of the four lanes of a Packet4f.
// Two rotate-and-add steps (by 8 bytes, then by 4 bytes) leave the total in
// every lane; pfirst extracts it. The block previously performed the whole
// reduction twice (once with vec_add and C-style casts, once with operators)
// and discarded the first result; only one pass is kept.
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  Packet4f b, sum;
  b   = vec_sld(a, a, 8);      // pair lanes {0,1} with {2,3}
  sum = a + b;
  b   = vec_sld(sum, sum, 4);  // pair the two partial sums
  sum += b;
  return pfirst(sum);
}
...
...
@@ -376,11 +577,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
// Now do the summation:
// Lines 0+1
sum
[
0
]
=
vec_add
(
sum
[
0
]
,
sum
[
1
]
)
;
sum
[
0
]
=
sum
[
0
]
+
sum
[
1
];
// Lines 2+3
sum
[
1
]
=
vec_add
(
sum
[
2
]
,
sum
[
3
]
)
;
sum
[
1
]
=
sum
[
2
]
+
sum
[
3
];
// Add the results
sum
[
0
]
=
vec_add
(
sum
[
0
]
,
sum
[
1
]
)
;
sum
[
0
]
=
sum
[
0
]
+
sum
[
1
];
return
sum
[
0
];
}
...
...
@@ -389,7 +590,11 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
Packet4i
sum
;
sum
=
vec_sums
(
a
,
p4i_ZERO
);
#ifdef _BIG_ENDIAN
sum
=
vec_sld
(
sum
,
p4i_ZERO
,
12
);
#else
sum
=
vec_sld
(
p4i_ZERO
,
sum
,
4
);
#endif
return
pfirst
(
sum
);
}
...
...
@@ -412,11 +617,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
// Now do the summation:
// Lines 0+1
sum
[
0
]
=
vec_add
(
sum
[
0
]
,
sum
[
1
]
)
;
sum
[
0
]
=
sum
[
0
]
+
sum
[
1
];
// Lines 2+3
sum
[
1
]
=
vec_add
(
sum
[
2
]
,
sum
[
3
]
)
;
sum
[
1
]
=
sum
[
2
]
+
sum
[
3
];
// Add the results
sum
[
0
]
=
vec_add
(
sum
[
0
]
,
sum
[
1
]
)
;
sum
[
0
]
=
sum
[
0
]
+
sum
[
1
];
return
sum
[
0
];
}
...
...
@@ -426,8 +631,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
// Horizontal product of the four lanes of a Packet4f.
// Multiplies the vector by itself rotated by 8 bytes, then by that result
// rotated by 4 bytes, and returns the first lane. The block previously had a
// second, unreachable copy of the computation after the first return; only
// one copy is kept.
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  Packet4f prod;
  prod = pmul(a, vec_sld(a, a, 8));
  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
}
template
<
>
EIGEN_STRONG_INLINE
int
predux_mul
<
Packet4i
>
(
const
Packet4i
&
a
)
...
...
@@ -476,8 +681,25 @@ struct palign_impl<Offset,Packet4f>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet4f
&
first
,
const
Packet4f
&
second
)
{
if
(
Offset
!=
0
)
first
=
vec_sld
(
first
,
second
,
Offset
*
4
);
#ifdef _BIG_ENDIAN
switch
(
Offset
%
4
)
{
case
1
:
first
=
vec_sld
(
first
,
second
,
4
);
break
;
case
2
:
first
=
vec_sld
(
first
,
second
,
8
);
break
;
case
3
:
first
=
vec_sld
(
first
,
second
,
12
);
break
;
}
#else
switch
(
Offset
%
4
)
{
case
1
:
first
=
vec_sld
(
second
,
first
,
12
);
break
;
case
2
:
first
=
vec_sld
(
second
,
first
,
8
);
break
;
case
3
:
first
=
vec_sld
(
second
,
first
,
4
);
break
;
}
#endif
}
};
...
...
@@ -486,11 +708,352 @@ struct palign_impl<Offset,Packet4i>
{
static
EIGEN_STRONG_INLINE
void
run
(
Packet4i
&
first
,
const
Packet4i
&
second
)
{
if
(
Offset
!=
0
)
first
=
vec_sld
(
first
,
second
,
Offset
*
4
);
#ifdef _BIG_ENDIAN
switch
(
Offset
%
4
)
{
case
1
:
first
=
vec_sld
(
first
,
second
,
4
);
break
;
case
2
:
first
=
vec_sld
(
first
,
second
,
8
);
break
;
case
3
:
first
=
vec_sld
(
first
,
second
,
12
);
break
;
}
#else
switch
(
Offset
%
4
)
{
case
1
:
first
=
vec_sld
(
second
,
first
,
12
);
break
;
case
2
:
first
=
vec_sld
(
second
,
first
,
8
);
break
;
case
3
:
first
=
vec_sld
(
second
,
first
,
4
);
break
;
}
#endif
}
};
// In-place transpose of a block of four Packet4f, done in two rounds of
// vec_mergeh / vec_mergel: first rows (0,2) and (1,3), then the merged pairs.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  const Packet4f hi02 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  const Packet4f lo02 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  const Packet4f hi13 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  const Packet4f lo13 = vec_mergel(kernel.packet[1], kernel.packet[3]);

  kernel.packet[0] = vec_mergeh(hi02, hi13);
  kernel.packet[1] = vec_mergel(hi02, hi13);
  kernel.packet[2] = vec_mergeh(lo02, lo13);
  kernel.packet[3] = vec_mergel(lo02, lo13);
}
// In-place transpose of a block of four Packet4i, done in two rounds of
// vec_mergeh / vec_mergel: first rows (0,2) and (1,3), then the merged pairs.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  const Packet4i hi02 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  const Packet4i lo02 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  const Packet4i hi13 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  const Packet4i lo13 = vec_mergel(kernel.packet[1], kernel.packet[3]);

  kernel.packet[0] = vec_mergeh(hi02, hi13);
  kernel.packet[1] = vec_mergel(hi02, hi13);
  kernel.packet[2] = vec_mergeh(lo02, lo13);
  kernel.packet[3] = vec_mergel(lo02, lo13);
}
// Per-lane blend of two Packet4i.
// The four selector entries are compared against p4i_ONE; lanes that compare
// equal take thenPacket, the others take elsePacket (vec_sel).
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  Packet4ui flags = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui take_then = reinterpret_cast<Packet4ui>(
      vec_cmpeq(reinterpret_cast<Packet4ui>(flags), reinterpret_cast<Packet4ui>(p4i_ONE)));
  return vec_sel(elsePacket, thenPacket, take_then);
}
// Per-lane blend of two Packet4f.
// The four selector entries are compared against p4i_ONE; lanes that compare
// equal take thenPacket, the others take elsePacket (vec_sel).
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  Packet4ui flags = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui take_then = reinterpret_cast<Packet4ui>(
      vec_cmpeq(reinterpret_cast<Packet4ui>(flags), reinterpret_cast<Packet4ui>(p4i_ONE)));
  return vec_sel(elsePacket, thenPacket, take_then);
}
//---------- double ----------
#ifdef __VSX__
// Two-lane vector types used by the double-precision packet code below.
typedef __vector double              Packet2d;
typedef __vector unsigned long long  Packet2ul;
typedef __vector long long           Packet2l;
// The boolean (mask) vector type depends on the compiler.
#if EIGEN_COMP_CLANG
typedef Packet2ul                    Packet2bl;
#else
typedef __vector __bool long         Packet2bl;
#endif
// Constant vectors for the 64-bit lane code.
static Packet2l  p2l_ONE   = { 1, 1 };
static Packet2l  p2l_ZERO  = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2d  p2d_ONE   = { 1.0, 1.0 };
static Packet2d  p2d_ZERO  = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d  p2d_MZERO = { -0.0, -0.0 };

// Offset vector added by plset<Packet2d>. It is built by shifting p2d_ZERO and
// p2d_ONE together by 8 bytes; the operand order is swapped between endiannesses.
#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
    vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
#else
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
    vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
#endif
// Primary template: broadcasts one element of a Packet2d, chosen by `index`,
// to both lanes. Only the specializations for 0 and 1 are defined.
template<int index>
Packet2d vec_splat_dbl(Packet2d& a);
// Index 0: permutes `a` with itself using the p16uc_PSET64_HI mask.
template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
{
  Packet2d splat = reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
  return splat;
}
// Index 1: permutes `a` with itself using the p16uc_PSET64_LO mask.
template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
{
  Packet2d splat = reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
  return splat;
}
// Packet traits for double: the packet type is Packet2d (2 elements), and the
// "half" packet is the same type. The enum lists which vectorized operations
// are advertised for this scalar type.
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable    = 1,
    AlignedOnScalar = 1,
    size            = 2,
    HasHalfPacket   = 1,

    // arithmetic
    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
    HasDiv  = 1,
    HasMin  = 1,
    HasMax  = 1,
    HasAbs  = 1,
    // math functions
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    // rounding and misc
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasNegate = 1,
    HasBlend = 1
  };
};
// Reverse mapping for Packet2d: scalar type double, 2 elements, Aligned16,
// and the half packet is Packet2d itself.
template<> struct unpacket_traits<Packet2d>
{
  typedef double type;
  enum { size = 2, alignment = Aligned16 };
  typedef Packet2d half;
};
// Streams the two elements of a Packet2l as "n0, n1".
// The vector is viewed as two int64_t values through a union.
inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
{
  union {
    Packet2l   v;
    int64_t n[2];
  } view;
  view.v = v;
  s << view.n[0];
  s << ", ";
  s << view.n[1];
  return s;
}
// Streams the two elements of a Packet2d as "n0, n1".
// The vector is viewed as two doubles through a union.
inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
{
  union {
    Packet2d   v;
    double n[2];
  } view;
  view.v = v;
  s << view.n[0];
  s << ", ";
  s << view.n[1];
  return s;
}
// Need to define them first or we get specialization after instantiation errors
// Aligned-load entry point for Packet2d: records the debug hook, then loads
// with vec_vsx_ld when __VSX__ is defined and with vec_ld otherwise.
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
#ifndef __VSX__
  return vec_ld(0, from);
#else
  return vec_vsx_ld(0, from);
#endif
}
// Aligned-store entry point for Packet2d: records the debug hook, then stores
// with vec_vsx_st when __VSX__ is defined and with vec_st otherwise.
template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
#ifndef __VSX__
  vec_st(from, 0, to);
#else
  vec_vsx_st(from, 0, to);
#endif
}
// Builds a Packet2d with both elements equal to `from`.
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
  Packet2d both = { from, from };
  return both;
}
// Reads four doubles starting at `a` and broadcasts each one into its own
// packet: a0..a3 receive a[0]..a[3] respectively.
// a1 and a3 double as scratch registers for the two aligned loads, so the
// statement order matters.
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
  // First pair: load {a[0], a[1]} into a1, then splat each half.
  a1 = pload<Packet2d>(a);
  a0 = vec_splat_dbl<0>(a1);
  a1 = vec_splat_dbl<1>(a1);

  // Second pair: load {a[2], a[3]} into a3, then splat each half.
  a3 = pload<Packet2d>(a+2);
  a2 = vec_splat_dbl<0>(a3);
  a3 = vec_splat_dbl<1>(a3);
}
// Strided gather: collects from[0] and from[stride] into an aligned scratch
// array and loads that array as a Packet2d.
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  double EIGEN_ALIGN16 lanes[2];
  lanes[0] = from[0*stride];
  lanes[1] = from[1*stride];
  return pload<Packet2d>(lanes);
}
// Strided scatter: stores the packet into an aligned scratch array, then
// writes its two elements to to[0] and to[stride].
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  double EIGEN_ALIGN16 lanes[2];
  pstore<double>(lanes, from);
  to[0*stride] = lanes[0];
  to[1*stride] = lanes[1];
}
// Returns the broadcast of `a` plus the p2d_COUNTDOWN offset vector.
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
{
  const Packet2d base = pset1<Packet2d>(a);
  return base + p2d_COUNTDOWN;
}
// Lane-wise sum of two Packet2d.
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d sum = a + b;
  return sum;
}
// Lane-wise difference of two Packet2d.
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d difference = a - b;
  return difference;
}
// Lane-wise negation of a Packet2d.
// Implemented by flipping the sign bit (XOR with -0.0) rather than computing
// 0 - a: the subtraction yields +0.0 for an input of +0.0, whereas scalar
// negation yields -0.0.
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
  return vec_xor(a, p2d_MZERO);
}
// Conjugate of a real packet: returns the input unchanged.
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a)
{
  const Packet2d& unchanged = a;
  return unchanged;
}
// Lane-wise product of two Packet2d, expressed as a multiply-add with the
// p2d_MZERO (-0.0) vector as addend.
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d product = vec_madd(a, b, p2d_MZERO);
  return product;
}
// Lane-wise quotient of two Packet2d, via vec_div.
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d quotient = vec_div(a, b);
  return quotient;
}
// for some weird reasons, it has to be overloaded for packets of integers
// Multiply-add on Packet2d: forwards (a, b, c) to vec_madd.
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
{
  const Packet2d result = vec_madd(a, b, c);
  return result;
}
// Lane-wise minimum of two Packet2d, emitted as inline assembly: a compare
// (xvcmpgedp) followed by a select (xxsel) on the compare result.
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  Packet2d selected;
  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (selected) : "wa" (a), "wa" (b));
  return selected;
}
// Lane-wise maximum of two Packet2d, emitted as inline assembly: a compare
// (xvcmpgtdp, operands swapped) followed by a select (xxsel) on the result.
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  Packet2d selected;
  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (selected) : "wa" (a), "wa" (b));
  return selected;
}
// Bitwise AND of two Packet2d, via vec_and.
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d bits = vec_and(a, b);
  return bits;
}
// Bitwise OR of two Packet2d, via vec_or.
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d bits = vec_or(a, b);
  return bits;
}
// Bitwise XOR of two Packet2d, via vec_xor.
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d bits = vec_xor(a, b);
  return bits;
}
// Computes a AND (NOT b) on Packet2d; NOT b is formed as vec_nor(b, b).
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const Packet2d not_b = vec_nor(b, b);
  return vec_and(a, not_b);
}
// Lane-wise rounding of a Packet2d, delegated to vec_round.
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
{
  const Packet2d rounded = vec_round(a);
  return rounded;
}
// Lane-wise ceiling of a Packet2d, delegated to vec_ceil.
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
{
  const Packet2d raised = vec_ceil(a);
  return raised;
}
// Lane-wise floor of a Packet2d, delegated to vec_floor.
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
{
  const Packet2d lowered = vec_floor(a);
  return lowered;
}
// Unaligned load of a Packet2d.
// The address is split into a base pointer produced by _EIGEN_ALIGNED_PTR and
// the low four address bits, which are passed to vec_vsx_ld as the offset.
// Uses the UNALIGNED debug hook, consistent with the Packet4i/Packet4f
// little-endian ploadu (the original tagged it EIGEN_DEBUG_ALIGNED_LOAD).
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
}
// Loads a Packet2d from `from` (aligned load when the pointer value is a
// multiple of 16, unaligned load otherwise) and returns vec_splat_dbl<0> of it.
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
{
  const bool is_aligned = (std::ptrdiff_t(from) % 16) == 0;
  Packet2d loaded = is_aligned ? pload<Packet2d>(from) : ploadu<Packet2d>(from);
  return vec_splat_dbl<0>(loaded);
}
// Unaligned store of a Packet2d.
// The packet is stored through the float-vector overload of vec_vsx_st (the
// value is cast to Packet4f and the destination to float*); the destination
// is split into a base pointer from _EIGEN_ALIGNED_PTR plus the low four
// address bits as offset.
// Uses the UNALIGNED debug hook (the original tagged this store with
// EIGEN_DEBUG_ALIGNED_STORE).
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
// Prefetch hint for double data, delegated to the EIGEN_PPC_PREFETCH macro.
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr)
{
  EIGEN_PPC_PREFETCH(addr);
}
// Returns element 0 of a Packet2d by storing the vector into an aligned
// 2-double scratch array and reading its first entry.
template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a)
{
  double EIGEN_ALIGN16 lanes[2];
  pstore<double>(lanes, a);
  return lanes[0];
}
// Reverses a Packet2d by permuting its bytes with the p16uc_REVERSE64 mask.
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
  const Packet16uc bytes = reinterpret_cast<Packet16uc>(a);
  return reinterpret_cast<Packet2d>(vec_perm(bytes, bytes, p16uc_REVERSE64));
}
// Lane-wise absolute value of a Packet2d, delegated to vec_abs.
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  const Packet2d magnitude = vec_abs(a);
  return magnitude;
}
// Horizontal sum of the two elements of a Packet2d: adds the vector to a copy
// of itself rotated by 8 bytes and returns the first element.
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  const Packet4f as_floats = reinterpret_cast<Packet4f>(a);
  const Packet2d swapped = reinterpret_cast<Packet2d>(vec_sld(as_floats, as_floats, 8));
  const Packet2d total = a + swapped;
  return pfirst<Packet2d>(total);
}
// Reduces two packets at once: each of vecs[0] and vecs[1] is added to its own
// 8-byte rotation, and the two results are then combined into one packet with
// a final vec_sld whose operand order depends on endianness.
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  const Packet4f raw0 = reinterpret_cast<Packet4f>(vecs[0]);
  const Packet4f raw1 = reinterpret_cast<Packet4f>(vecs[1]);

  const Packet2d total0 = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(raw0, raw0, 8));
  const Packet2d total1 = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(raw1, raw1, 8));

#ifdef _BIG_ENDIAN
  return reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(total0), reinterpret_cast<Packet4f>(total1), 8));
#else
  return reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(total1), reinterpret_cast<Packet4f>(total0), 8));
#endif
}
// Other reduction functions:
// mul
// Product of the two elements of a Packet2d: multiplies the vector by a copy
// of itself rotated by 8 bytes and returns the first element.
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  const Packet4ui raw = reinterpret_cast<Packet4ui>(a);
  const Packet2d swapped = reinterpret_cast<Packet2d>(vec_sld(raw, raw, 8));
  return pfirst(pmul(a, swapped));
}
// min
// Minimum of the two elements of a Packet2d: takes pmin of the vector and a
// copy of itself rotated by 8 bytes and returns the first element.
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  const Packet4ui raw = reinterpret_cast<Packet4ui>(a);
  const Packet2d swapped = reinterpret_cast<Packet2d>(vec_sld(raw, raw, 8));
  return pfirst(pmin(a, swapped));
}
// max
// Maximum of the two elements of a Packet2d: takes pmax of the vector and a
// copy of itself rotated by 8 bytes and returns the first element.
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  const Packet4ui raw = reinterpret_cast<Packet4ui>(a);
  const Packet2d swapped = reinterpret_cast<Packet2d>(vec_sld(raw, raw, 8));
  return pfirst(pmax(a, swapped));
}
// palign for Packet2d. Only Offset == 1 does anything: `first` is replaced by
// an 8-byte vec_sld of `first` and `second`, with the operand order swapped
// between big- and little-endian builds. Any other Offset leaves `first` as is.
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset != 1)
      return;
#ifdef _BIG_ENDIAN
    first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
#else
    first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
#endif
  }
};
// In-place transpose of a 2x2 block of doubles: the two output rows are the
// permutations of the input rows under p16uc_TRANSPOSE64_HI and _LO.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  const Packet2d row_hi = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
  const Packet2d row_lo = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
  kernel.packet[0] = row_hi;
  kernel.packet[1] = row_lo;
}
// Per-lane blend of two Packet2d.
// The two selector entries are compared against p2l_ONE (both sides
// reinterpreted as Packet2d for the comparison); lanes that compare equal take
// thenPacket, the others take elsePacket (vec_sel).
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2l flags = { ifPacket.select[0], ifPacket.select[1] };
  Packet2bl take_then = reinterpret_cast<Packet2bl>(
      vec_cmpeq(reinterpret_cast<Packet2d>(flags), reinterpret_cast<Packet2d>(p2l_ONE)) );
  return vec_sel(elsePacket, thenPacket, take_then);
}
#endif // __VSX__
}
// end namespace internal
}
// end namespace Eigen
...
...
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/Complex.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_CUDA_H
#define EIGEN_COMPLEX_CUDA_H
// clang-format off
namespace
Eigen
{
namespace
internal
{
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
// functions and thus Eigen functors making use of these operators fail to
// compile. Here, we manually specialize these functors for complex types when
// building for CUDA to avoid non-constexpr methods.
// Sum
// Sum functor specialised for std::complex<T>: builds the result from the
// real and imaginary parts (via numext::real / numext::imag) instead of
// calling std::complex's operator+.
template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T re = numext::real(a) + numext::real(b);
    const T im = numext::imag(a) + numext::imag(b);
    return std::complex<T>(re, im);
  }
};
// Non-const complex arguments reuse the const-qualified specialisation.
template<typename T>
struct scalar_sum_op<std::complex<T>, std::complex<T> >
  : scalar_sum_op<const std::complex<T>, const std::complex<T> >
{};
// Difference
// Difference functor specialised for std::complex<T>: builds the result from
// the real and imaginary parts (via numext::real / numext::imag) instead of
// calling std::complex's operator-.
template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T re = numext::real(a) - numext::real(b);
    const T im = numext::imag(a) - numext::imag(b);
    return std::complex<T>(re, im);
  }
};
// Non-const complex arguments reuse the const-qualified specialisation.
template<typename T>
struct scalar_difference_op<std::complex<T>, std::complex<T> >
  : scalar_difference_op<const std::complex<T>, const std::complex<T> >
{};
// Product
// Product functor specialised for std::complex<T>, written out on the real
// and imaginary parts: (ar*br - ai*bi, ar*bi + ai*br).
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    Vectorizable = packet_traits<std::complex<T>>::HasMul
  };
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T ar = numext::real(a);
    const T ai = numext::imag(a);
    const T br = numext::real(b);
    const T bi = numext::imag(b);
    return std::complex<T>(ar * br - ai * bi,
                           ar * bi + ai * br);
  }
};
// Non-const complex arguments reuse the const-qualified specialisation.
template<typename T>
struct scalar_product_op<std::complex<T>, std::complex<T> >
  : scalar_product_op<const std::complex<T>, const std::complex<T> >
{};
// Quotient
// Quotient functor specialised for std::complex<T>, written out on the real
// and imaginary parts: a * conj(b) scaled by 1 / (br^2 + bi^2).
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    Vectorizable = packet_traits<std::complex<T>>::HasDiv
  };
  typedef typename std::complex<T> result_type;

  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T ar = numext::real(a);
    const T ai = numext::imag(a);
    const T br = numext::real(b);
    const T bi = numext::imag(b);
    // Reciprocal of |b|^2, applied to both components below.
    const T inv_norm = T(1) / (br * br + bi * bi);
    return std::complex<T>((ar * br + ai * bi) * inv_norm,
                           (ai * br - ar * bi) * inv_norm);
  }
};
// Non-const complex arguments reuse the const-qualified specialisation.
template<typename T>
struct scalar_quotient_op<std::complex<T>, std::complex<T> >
  : scalar_quotient_op<const std::complex<T>, const std::complex<T> >
{};
#endif
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_COMPLEX_CUDA_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/Half.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in float32_bits for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
#ifndef EIGEN_HALF_CUDA_H
#define EIGEN_HALF_CUDA_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
#else
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif
#include <sstream>
namespace
Eigen
{
struct
half
;
namespace
half_impl
{
#if !defined(EIGEN_HAS_CUDA_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
// Fallback raw storage type: a single unsigned short `x`, zero by default or
// initialised from an explicit raw value.
struct __half_raw {
  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short bits) : x(bits) {}
  unsigned short x;
};
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef
__half
__half_raw
;
#endif
// Forward declarations of the conversion helpers used by the half
// constructors and cast operators below.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);  // wrap a raw 16-bit value
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);          // float -> half
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);                // half -> float
// Thin layer over __half_raw that supplies the default, copy and
// from-__half_raw constructors (plus a from-__half constructor when building
// with CUDA FP16 support and EIGEN_CUDACC_VER >= 90000).
struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC half_base() {}
  EIGEN_DEVICE_FUNC half_base(const half_base& other) : __half_raw(other) {}
  EIGEN_DEVICE_FUNC half_base(const __half_raw& raw) : __half_raw(raw) {}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
  // Reinterprets the __half object as a __half_raw.
  EIGEN_DEVICE_FUNC half_base(const __half& cuda_half) : __half_raw(*(__half_raw*)&cuda_half) {}
#endif
};
}
// namespace half_impl
// Class definition.
struct
half
:
public
half_impl
::
half_base
{
#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
typedef
half_impl
::
__half_raw
__half_raw
;
#endif
// Default constructor: defers to half_base's default constructor.
EIGEN_DEVICE_FUNC half() {}
// Wraps an existing raw half value.
EIGEN_DEVICE_FUNC half(const __half_raw& h)
  : half_impl::half_base(h)
{}
// Copy constructor.
EIGEN_DEVICE_FUNC half(const half& h)
  : half_impl::half_base(h)
{}
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
// Conversion from CUDA's __half (only compiled under the enclosing #if).
EIGEN_DEVICE_FUNC half(const __half& h)
  : half_impl::half_base(h)
{}
#endif
// Construct from bool: true maps to the raw pattern 0x3c00, false to 0.
explicit EIGEN_DEVICE_FUNC half(bool b)
  : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0))
{}
template
<
class
T
>
explicit
EIGEN_DEVICE_FUNC
half
(
const
T
&
val
)
:
half_impl
::
half_base
(
half_impl
::
float_to_half_rtne
(
static_cast
<
float
>
(
val
)))
{}
explicit
EIGEN_DEVICE_FUNC
half
(
float
f
)
:
half_impl
::
half_base
(
half_impl
::
float_to_half_rtne
(
f
))
{}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
bool
)
const
{
// +0.0 and -0.0 become false, everything else becomes true.
return
(
x
&
0x7fff
)
!=
0
;
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
signed
char
)
const
{
return
static_cast
<
signed
char
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
unsigned
char
)
const
{
return
static_cast
<
unsigned
char
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
short
)
const
{
return
static_cast
<
short
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
unsigned
short
)
const
{
return
static_cast
<
unsigned
short
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
int
)
const
{
return
static_cast
<
int
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
unsigned
int
)
const
{
return
static_cast
<
unsigned
int
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
long
)
const
{
return
static_cast
<
long
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
unsigned
long
)
const
{
return
static_cast
<
unsigned
long
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
long
long
)
const
{
return
static_cast
<
long
long
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
unsigned
long
long
)
const
{
return
static_cast
<
unsigned
long
long
>
(
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
float
)
const
{
return
half_impl
::
half_to_float
(
*
this
);
}
EIGEN_DEVICE_FUNC
EIGEN_EXPLICIT_CAST
(
double
)
const
{
return
static_cast
<
double
>
(
half_impl
::
half_to_float
(
*
this
));
}
EIGEN_DEVICE_FUNC
half
&
operator
=
(
const
half
&
other
)
{
x
=
other
.
x
;
return
*
this
;
}
};
}
// end namespace Eigen
namespace
std
{
// std::numeric_limits for Eigen::half. The value-returning members build their
// result directly from a 16-bit pattern (1 sign bit, 5 exponent bits,
// 10 mantissa bits).
template<> struct numeric_limits<Eigen::half> {
  static const bool is_specialized = true;
  static const bool is_signed = true;
  static const bool is_integer = false;
  static const bool is_exact = false;
  static const bool has_infinity = true;
  static const bool has_quiet_NaN = true;
  static const bool has_signaling_NaN = true;
  static const float_denorm_style has_denorm = denorm_present;
  static const bool has_denorm_loss = false;
  static const std::float_round_style round_style = std::round_to_nearest;
  static const bool is_iec559 = false;
  static const bool is_bounded = false;
  static const bool is_modulo = false;
  static const int digits = 11;
  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int radix = 2;
  static const int min_exponent = -13;
  static const int min_exponent10 = -4;
  static const int max_exponent = 16;
  static const int max_exponent10 = 4;
  static const bool traps = true;
  static const bool tinyness_before = false;

  // (min)/(max) are parenthesized so that function-like min/max macros cannot
  // expand here.
  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
  static Eigen::half round_error() { return Eigen::half(0.5); }
  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
  // Exponent all ones, top mantissa bit (0x0200) set: quiet NaN.
  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
  // Exponent all ones, top mantissa bit clear, non-zero payload: signaling
  // NaN. Previously this returned 0x7e00, i.e. the quiet NaN pattern.
  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
// If std::numeric_limits<T> is specialized, should also specialize
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
// cv-qualified variants simply inherit everything from the unqualified
// specialization.
template<> struct numeric_limits<const Eigen::half>
    : numeric_limits<Eigen::half> {};
template<> struct numeric_limits<volatile Eigen::half>
    : numeric_limits<Eigen::half> {};
template<> struct numeric_limits<const volatile Eigen::half>
    : numeric_limits<Eigen::half> {};
}
// end namespace std
namespace
Eigen
{
namespace
half_impl
{
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than float32_bits arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
// Device-only arithmetic on half. Addition, multiplication, subtraction and
// negation forward to the CUDA half intrinsics; division is carried out in
// float and converted back.
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
  return __hadd(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
  return __hmul(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
  return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
  const float numerator = __half2float(a);
  const float denominator = __half2float(b);
  return __float2half(numerator / denominator);
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
  return __hneg(a);
}
// Device-only compound assignments, each expressed through the matching
// binary operator. They return a reference to the updated left operand.
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
  return a = a + b;
}
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
  return a = a * b;
}
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
  return a = a - b;
}
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
  return a = a / b;
}
// Device-only comparisons, each forwarding to the CUDA half comparison
// intrinsic of the same meaning.
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
  return __heq(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
  return __hne(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
  return __hlt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
  return __hle(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
  return __hgt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
  return __hge(a, b);
}
#else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion
// to/from float32_bits.
// Emulated arithmetic: both operands are widened to float, the operation is
// done in float, and the result is converted back to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
  return half(static_cast<float>(a) + static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
  return half(static_cast<float>(a) * static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
  return half(static_cast<float>(a) - static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
  return half(static_cast<float>(a) / static_cast<float>(b));
}
// Negation needs no conversion: it only toggles the sign bit (0x8000).
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
  half negated;
  negated.x = a.x ^ 0x8000;
  return negated;
}
// Emulated compound assignments: compute in float, store the rounded result
// back into the left operand and return a reference to it.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
  return a = half(static_cast<float>(a) + static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
  return a = half(static_cast<float>(a) * static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
  return a = half(static_cast<float>(a) - static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
  return a = half(static_cast<float>(a) / static_cast<float>(b));
}
// Emulated comparisons: both sides are widened to float and compared there.
// Equality and inequality go through the numext strict helpers.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
  return numext::equal_strict(static_cast<float>(a), static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
  return numext::not_equal_strict(static_cast<float>(a), static_cast<float>(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return static_cast<float>(a) < static_cast<float>(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
  return static_cast<float>(a) <= static_cast<float>(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
  return static_cast<float>(a) > static_cast<float>(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
  return static_cast<float>(a) >= static_cast<float>(b);
}
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
// Divides a half by an integer index. Both operands are converted to float so
// the divisor is never rounded to half precision first.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
  const float numerator = static_cast<float>(a);
  const float denominator = static_cast<float>(b);
  return half(numerator / denominator);
}
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
// Returns a __half_raw whose stored bits are exactly `x`; no numeric
// conversion takes place.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
  __half_raw bits;
  bits.x = x;
  return bits;
}
// Overlays a float with an unsigned int so the conversion routines can read
// and edit the bit pattern of a float. `u` is declared first, so brace
// initialization sets the integer view.
union float32_bits {
  unsigned int u;
  float f;
};
// Converts a float to half bits. Uses the CUDA intrinsic or the F16C
// instruction when available; otherwise falls back to bit manipulation.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
  __half tmp_ff = __float2half(ff);
  return *(__half_raw*)&tmp_ff;

#elif defined(EIGEN_HAS_FP16_C)
  __half_raw h;
  h.x = _cvtss_sh(ff, 0);
  return h;

#else
  // Float bit patterns used as thresholds / magic constants below.
  const float32_bits f32infty = { 255 << 23 };
  const float32_bits f16max = { (127 + 16) << 23 };
  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  const unsigned int sign_mask = 0x80000000u;

  float32_bits in;
  in.f = ff;

  __half_raw out;
  out.x = static_cast<unsigned short>(0x0u);

  // Strip the sign; it is re-attached to the result at the very end.
  const unsigned int sign = in.u & sign_mask;
  in.u ^= sign;

  // NOTE all the integer compares in this function can be safely
  // compiled into signed compares since all operands are below
  // 0x80000000. Important if you want fast straight SSE2 code
  // (since there's no unsigned PCMPGTD).
  if (in.u >= f16max.u) {
    // Result is Inf or NaN (all exponent bits set): NaN->qNaN and Inf->Inf.
    out.x = (in.u > f32infty.u) ? 0x7e00 : 0x7c00;
  } else if (in.u < (113 << 23)) {
    // Resulting FP16 is subnormal or zero.
    // Use a magic value to align our 10 mantissa bits at the bottom of
    // the float. As long as FP addition is round-to-nearest-even this
    // just works.
    in.f += denorm_magic.f;
    // One integer subtract of the bias later, we have our final value.
    out.x = static_cast<unsigned short>(in.u - denorm_magic.u);
  } else {
    // Normalized number. Remember whether the resulting mantissa is odd.
    const unsigned int mant_odd = (in.u >> 13) & 1;
    // Update exponent, rounding bias part 1.
    in.u += (static_cast<unsigned int>(15 - 127) << 23) + 0xfff;
    // Rounding bias part 2.
    in.u += mant_odd;
    // Take the bits!
    out.x = static_cast<unsigned short>(in.u >> 13);
  }

  out.x |= static_cast<unsigned short>(sign >> 16);
  return out;
#endif
}
// Converts half bits to a float. Uses the CUDA intrinsic or the F16C
// instruction when available; otherwise falls back to bit manipulation.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
  return __half2float(h);

#elif defined(EIGEN_HAS_FP16_C)
  return _cvtsh_ss(h.x);

#else
  const float32_bits magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13;  // exponent mask after shift
  float32_bits o;

  o.u = (h.x & 0x7fff) << 13;                     // exponent/mantissa bits
  const unsigned int exp = shifted_exp & o.u;     // just the exponent
  o.u += (127 - 15) << 23;                        // exponent adjust

  // Handle exponent special cases.
  if (exp == shifted_exp) {                       // Inf/NaN?
    o.u += (128 - 16) << 23;                      // extra exp adjust
  } else if (exp == 0) {                          // Zero/Denormal?
    o.u += 1 << 23;                               // extra exp adjust
    o.f -= magic.f;                               // renormalize
  }

  // Sign bit. `h.x & 0x8000` has type int after promotion, and shifting it
  // left by 16 would move a bit into the sign position of a signed int, so
  // convert to unsigned before shifting. The resulting bits are unchanged.
  o.u |= static_cast<unsigned int>(h.x & 0x8000) << 16;
  return o.f;
#endif
}
// --- standard functions ---
// Classification helpers working on the raw bits. Masking with 0x7fff drops
// the sign; 0x7c00 is the all-ones exponent with a zero mantissa.

// True when the magnitude bits equal the infinity pattern.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
  return (a.x & 0x7fff) == 0x7c00;
}
// True when the magnitude bits are above the infinity pattern (or, with
// native fp16 support, when the CUDA intrinsic says so).
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hisnan(a);
#else
  return (a.x & 0x7fff) > 0x7c00;
#endif
}
// True when the value is neither infinite nor NaN.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
  return !((isinf EIGEN_NOT_A_MACRO (a)) || (isnan EIGEN_NOT_A_MACRO (a)));
}
// Absolute value: copies the bits of `a` with the sign bit cleared.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
  half magnitude;
  magnitude.x = a.x & 0x7FFF;
  return magnitude;
}
// Exponential, logarithms and square root on half. Where a native CUDA half
// function is selected by the preprocessor it is used directly; otherwise the
// argument is widened to float, the float routine is called, and the result is
// converted back to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
  return half(hexp(a));
#else
  const float arg = static_cast<float>(a);
  return half(::expf(arg));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return half(::hlog(a));
#else
  const float arg = static_cast<float>(a);
  return half(::logf(arg));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
  const float arg = static_cast<float>(a);
  return half(numext::log1p(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
  const float arg = static_cast<float>(a);
  return half(::log10f(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
  return half(hsqrt(a));
#else
  const float arg = static_cast<float>(a);
  return half(::sqrtf(arg));
#endif
}
// Power and trigonometric / hyperbolic functions on half. Every one widens its
// argument(s) to float, calls the float routine, and converts the result back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
  const float base = static_cast<float>(a);
  const float exponent = static_cast<float>(b);
  return half(::powf(base, exponent));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
  const float arg = static_cast<float>(a);
  return half(::sinf(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
  const float arg = static_cast<float>(a);
  return half(::cosf(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
  const float arg = static_cast<float>(a);
  return half(::tanf(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
  const float arg = static_cast<float>(a);
  return half(::tanhf(arg));
}
// Rounding towards -infinity / +infinity. Uses the CUDA half function when
// the preprocessor selects it, otherwise the float routine.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
  return half(hfloor(a));
#else
  const float arg = static_cast<float>(a);
  return half(::floorf(arg));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
  return half(hceil(a));
#else
  const float arg = static_cast<float>(a);
  return half(::ceilf(arg));
#endif
}
// Returns `b` only when it is strictly smaller than `a`; otherwise `a`.
// The name is parenthesized so a function-like min macro cannot expand here.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hlt(b, a) ? b : a;
#else
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  if (rhs < lhs) {
    return b;
  }
  return a;
#endif
}
// Returns `b` only when `a` is strictly smaller than `b`; otherwise `a`.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return __hlt(a, b) ? b : a;
#else
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  if (lhs < rhs) {
    return b;
  }
  return a;
#endif
}
// Streams a half by printing its float value; returns the stream for chaining.
EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
  return os << static_cast<float>(v);
}
}
// end namespace half_impl
// import Eigen::half_impl::half into Eigen namespace
// using half_impl::half;
namespace
internal
{
// Random number generation for half, based on std::rand().
template<>
struct random_default_impl<half, false, false>
{
  // Returns x + (y - x) * t, where t = rand() / RAND_MAX is computed in float
  // and then converted to half.
  static inline half run(const half& x, const half& y)
  {
    const half fraction = half(float(std::rand()) / float(RAND_MAX));
    return x + (y - x) * fraction;
  }
  // Without arguments the range is [-1, 1].
  static inline half run()
  {
    const half lower(-1.f);
    const half upper(1.f);
    return run(lower, upper);
  }
};
// Marks half as an arithmetic type for Eigen's internal type traits.
template<>
struct is_arithmetic<half>
{
  enum { value = true };
};
}
// end namespace internal
// Eigen numeric traits for half. The value-returning members are built
// directly from 16-bit patterns.
template<> struct NumTraits<Eigen::half>
    : GenericNumTraits<Eigen::half>
{
  enum {
    IsSigned = true,
    IsInteger = false,
    IsComplex = false,
    RequireInitialization = false
  };

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
    return half_impl::raw_uint16_to_half(0x0800);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
    return Eigen::half(1e-2f);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
    return half_impl::raw_uint16_to_half(0x7bff);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
    return half_impl::raw_uint16_to_half(0xfbff);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
    return half_impl::raw_uint16_to_half(0x7c00);
  }
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
    // 0x7e00 has the top mantissa bit set, i.e. it is a quiet NaN, and it is
    // the same pattern std::numeric_limits<Eigen::half>::quiet_NaN() returns.
    // The previous value 0x7c01 had that bit clear (a signaling pattern).
    return half_impl::raw_uint16_to_half(0x7e00);
  }
};
}
// end namespace Eigen
// C-like standard mathematical functions and transcendentals.
// Global-namespace absolute value for half: clears the sign bit of the raw
// representation.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
  Eigen::half magnitude;
  magnitude.x = a.x & 0x7FFF;
  return magnitude;
}
// Global-namespace, C-style math functions for half (suffix "h"). Except for
// the native CUDA path in logh, each widens to float, calls the float routine
// and converts the result back to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
  const float arg = static_cast<float>(a);
  return Eigen::half(::expf(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
  return Eigen::half(::hlog(a));
#else
  const float arg = static_cast<float>(a);
  return Eigen::half(::logf(arg));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
  const float arg = static_cast<float>(a);
  return Eigen::half(::sqrtf(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
  const float base = static_cast<float>(a);
  const float exponent = static_cast<float>(b);
  return Eigen::half(::powf(base, exponent));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
  const float arg = static_cast<float>(a);
  return Eigen::half(::floorf(arg));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
  const float arg = static_cast<float>(a);
  return Eigen::half(::ceilf(arg));
}
namespace
std
{
#if __cplusplus > 199711L
// std::hash for Eigen::half, derived from the raw 16-bit representation.
template <>
struct hash<Eigen::half> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
    // +0.0 (0x0000) and -0.0 (0x8000) compare equal through operator==, so
    // they must produce the same hash; map both to 0. All other values hash
    // to their bit pattern, as before.
    if ((a.x & 0x7fff) == 0) {
      return static_cast<std::size_t>(0);
    }
    return static_cast<std::size_t>(a.x);
  }
};
#endif
}
// end namespace std
// Add the missing shfl_xor intrinsic
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
// Warp shuffle (xor variant) for Eigen::half: the value is widened to float,
// shuffled with the float intrinsic, and converted back to half.
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
  const float widened = static_cast<float>(var);
#if EIGEN_CUDACC_VER < 90000
  return static_cast<Eigen::half>(__shfl_xor(widened, laneMask, width));
#else
  // From CUDA 9 on, use the _sync intrinsic with a full member mask.
  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, widened, laneMask, width));
#endif
}
#endif
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
// __ldg overload for Eigen::half: loads the 16 raw bits through the
// unsigned short overload and rewraps them as a half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
  const unsigned short* raw_ptr = reinterpret_cast<const unsigned short*>(ptr);
  return Eigen::half_impl::raw_uint16_to_half(__ldg(raw_ptr));
}
#endif
#if defined(EIGEN_CUDA_ARCH)
namespace
Eigen
{
namespace
numext
{
// numext specializations for half: each forwards to the bit-level helper of
// the same name in half_impl.
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool (isnan)(const Eigen::half& h) {
  return (half_impl::isnan)(h);
}

template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool (isinf)(const Eigen::half& h) {
  return (half_impl::isinf)(h);
}

template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool (isfinite)(const Eigen::half& h) {
  return (half_impl::isfinite)(h);
}
}
// namespace numext
}
// namespace Eigen
#endif
#endif // EIGEN_HALF_CUDA_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/MathFunctions.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
#define EIGEN_MATH_FUNCTIONS_CUDA_H
namespace
Eigen
{
namespace
internal
{
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
// Lane-wise natural logarithm for the CUDA packet types.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
  const float x = logf(a.x);
  const float y = logf(a.y);
  const float z = logf(a.z);
  const float w = logf(a.w);
  return make_float4(x, y, z, w);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 plog<double2>(const double2& a)
{
  // Pull in the global log so the unqualified calls below resolve to it.
  using ::log;
  const double x = log(a.x);
  const double y = log(a.y);
  return make_double2(x, y);
}
// Lane-wise log(1 + x) for the CUDA packet types.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog1p<float4>(const float4& a)
{
  const float x = log1pf(a.x);
  const float y = log1pf(a.y);
  const float z = log1pf(a.z);
  const float w = log1pf(a.w);
  return make_float4(x, y, z, w);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 plog1p<double2>(const double2& a)
{
  const double x = log1p(a.x);
  const double y = log1p(a.y);
  return make_double2(x, y);
}
// Lane-wise exponential for the CUDA packet types.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pexp<float4>(const float4& a)
{
  const float x = expf(a.x);
  const float y = expf(a.y);
  const float z = expf(a.z);
  const float w = expf(a.w);
  return make_float4(x, y, z, w);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pexp<double2>(const double2& a)
{
  // Pull in the global exp so the unqualified calls below resolve to it.
  using ::exp;
  const double x = exp(a.x);
  const double y = exp(a.y);
  return make_double2(x, y);
}
// Lane-wise square root for the CUDA packet types.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 psqrt<float4>(const float4& a)
{
  const float x = sqrtf(a.x);
  const float y = sqrtf(a.y);
  const float z = sqrtf(a.z);
  const float w = sqrtf(a.w);
  return make_float4(x, y, z, w);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 psqrt<double2>(const double2& a)
{
  // Pull in the global sqrt so the unqualified calls below resolve to it.
  using ::sqrt;
  const double x = sqrt(a.x);
  const double y = sqrt(a.y);
  return make_double2(x, y);
}
// Lane-wise reciprocal square root for the CUDA packet types.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 prsqrt<float4>(const float4& a)
{
  const float x = rsqrtf(a.x);
  const float y = rsqrtf(a.y);
  const float z = rsqrtf(a.z);
  const float w = rsqrtf(a.w);
  return make_float4(x, y, z, w);
}

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 prsqrt<double2>(const double2& a)
{
  const double x = rsqrt(a.x);
  const double y = rsqrt(a.y);
  return make_double2(x, y);
}
#endif
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/PacketMath.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_CUDA_H
#define EIGEN_PACKET_MATH_CUDA_H
namespace
Eigen
{
namespace
internal
{
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
// The CUDA vector types used as packets count as arithmetic types.
template<> struct is_arithmetic<float4>
{
  enum { value = true };
};
template<> struct is_arithmetic<double2>
{
  enum { value = true };
};
// Packet description for float on the GPU: a packet is a float4 (4 lanes) and
// there is no separate half-size packet, so `half` names the same type.
template<> struct packet_traits<float> : default_packet_traits
{
  typedef float4 type;
  typedef float4 half;
  enum {
    // Layout.
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,

    // Operations advertised for this packet type.
    HasDiv = 1,
    HasSin = 0,
    HasCos = 0,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasLGamma = 1,
    HasDiGamma = 1,
    HasZeta = 1,
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasIGamma = 1,
    HasIGammac = 1,
    HasBetaInc = 1,

    HasBlend = 0,
  };
};
// Packet description for double on the GPU: a packet is a double2 (2 lanes)
// and there is no separate half-size packet, so `half` names the same type.
template<> struct packet_traits<double> : default_packet_traits
{
  typedef double2 type;
  typedef double2 half;
  enum {
    // Layout.
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 0,

    // Operations advertised for this packet type.
    HasDiv = 1,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasLGamma = 1,
    HasDiGamma = 1,
    HasZeta = 1,
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasIGamma = 1,
    HasIGammac = 1,
    HasBetaInc = 1,

    HasBlend = 0,
  };
};
// Reverse mapping from packet type to scalar type, lane count and alignment.
template<> struct unpacket_traits<float4>
{
  typedef float type;
  typedef float4 half;
  enum { size = 4, alignment = Aligned16 };
};
template<> struct unpacket_traits<double2>
{
  typedef double type;
  typedef double2 half;
  enum { size = 2, alignment = Aligned16 };
};
// Broadcasts one scalar into every lane of the packet.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pset1<float4>(const float& from)
{
  const float v = from;
  return make_float4(v, v, v, v);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pset1<double2>(const double& from)
{
  const double v = from;
  return make_double2(v, v);
}
// Builds the packet (a, a+1, a+2, ...) with one entry per lane.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plset<float4>(const float& a)
{
  const float lane0 = a;
  const float lane1 = a + 1;
  const float lane2 = a + 2;
  const float lane3 = a + 3;
  return make_float4(lane0, lane1, lane2, lane3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 plset<double2>(const double& a)
{
  const double lane0 = a;
  const double lane1 = a + 1;
  return make_double2(lane0, lane1);
}
// Lane-wise addition.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 padd<float4>(const float4& a, const float4& b)
{
  const float x = a.x + b.x;
  const float y = a.y + b.y;
  const float z = a.z + b.z;
  const float w = a.w + b.w;
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 padd<double2>(const double2& a, const double2& b)
{
  const double x = a.x + b.x;
  const double y = a.y + b.y;
  return make_double2(x, y);
}
// Lane-wise subtraction (a - b).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 psub<float4>(const float4& a, const float4& b)
{
  const float x = a.x - b.x;
  const float y = a.y - b.y;
  const float z = a.z - b.z;
  const float w = a.w - b.w;
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 psub<double2>(const double2& a, const double2& b)
{
  const double x = a.x - b.x;
  const double y = a.y - b.y;
  return make_double2(x, y);
}
// Lane-wise negation.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pnegate(const float4& a)
{
  const float x = -a.x;
  const float y = -a.y;
  const float z = -a.z;
  const float w = -a.w;
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pnegate(const double2& a)
{
  const double x = -a.x;
  const double y = -a.y;
  return make_double2(x, y);
}
// Complex conjugate of a real packet: returns the input unchanged.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pconj(const float4& a)
{
  return a;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pconj(const double2& a)
{
  return a;
}
// Lane-wise multiplication.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pmul<float4>(const float4& a, const float4& b)
{
  const float x = a.x * b.x;
  const float y = a.y * b.y;
  const float z = a.z * b.z;
  const float w = a.w * b.w;
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pmul<double2>(const double2& a, const double2& b)
{
  const double x = a.x * b.x;
  const double y = a.y * b.y;
  return make_double2(x, y);
}
// Lane-wise division (a / b).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pdiv<float4>(const float4& a, const float4& b)
{
  const float x = a.x / b.x;
  const float y = a.y / b.y;
  const float z = a.z / b.z;
  const float w = a.w / b.w;
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pdiv<double2>(const double2& a, const double2& b)
{
  const double x = a.x / b.x;
  const double y = a.y / b.y;
  return make_double2(x, y);
}
// Lane-wise minimum via fminf / fmin.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pmin<float4>(const float4& a, const float4& b)
{
  const float x = fminf(a.x, b.x);
  const float y = fminf(a.y, b.y);
  const float z = fminf(a.z, b.z);
  const float w = fminf(a.w, b.w);
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pmin<double2>(const double2& a, const double2& b)
{
  const double x = fmin(a.x, b.x);
  const double y = fmin(a.y, b.y);
  return make_double2(x, y);
}
// Lane-wise maximum via fmaxf / fmax.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pmax<float4>(const float4& a, const float4& b)
{
  const float x = fmaxf(a.x, b.x);
  const float y = fmaxf(a.y, b.y);
  const float z = fmaxf(a.z, b.z);
  const float w = fmaxf(a.w, b.w);
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pmax<double2>(const double2& a, const double2& b)
{
  const double x = fmax(a.x, b.x);
  const double y = fmax(a.y, b.y);
  return make_double2(x, y);
}
// Packet load that reads the whole packet through a vector-typed pointer.
// NOTE(review): presumably `from` must be suitably aligned for the vector
// type; confirm against callers.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pload<float4>(const float* from)
{
  const float4* packet_ptr = reinterpret_cast<const float4*>(from);
  return *packet_ptr;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 pload<double2>(const double* from)
{
  const double2* packet_ptr = reinterpret_cast<const double2*>(from);
  return *packet_ptr;
}
// Packet load that reads the scalars one by one instead of through a
// vector-typed pointer.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 ploadu<float4>(const float* from)
{
  const float x = from[0];
  const float y = from[1];
  const float z = from[2];
  const float w = from[3];
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double2 ploadu<double2>(const double* from)
{
  const double x = from[0];
  const double y = from[1];
  return make_double2(x, y);
}
template
<
>
EIGEN_STRONG_INLINE
float4
ploaddup
<
float4
>
(
const
float
*
from
)
{
return
make_float4
(
from
[
0
],
from
[
0
],
from
[
1
],
from
[
1
]);
}
template
<
>
EIGEN_STRONG_INLINE
double2
ploaddup
<
double2
>
(
const
double
*
from
)
{
return
make_double2
(
from
[
0
],
from
[
0
]);
}
// Packet store that writes the whole packet through a vector-typed pointer.
// NOTE(review): presumably `to` must be suitably aligned for the vector type;
// confirm against callers.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<float>(float* to, const float4& from)
{
  float4* packet_ptr = reinterpret_cast<float4*>(to);
  *packet_ptr = from;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<double>(double* to, const double2& from)
{
  double2* packet_ptr = reinterpret_cast<double2*>(to);
  *packet_ptr = from;
}
// Packet store that writes the lanes one scalar at a time.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<float>(float* to, const float4& from)
{
  float* out = to;
  out[0] = from.x;
  out[1] = from.y;
  out[2] = from.z;
  out[3] = from.w;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<double>(double* to, const double2& from)
{
  double* out = to;
  out[0] = from.x;
  out[1] = from.y;
}
// Read-only aligned packet load. On compute capability >= 3.5 the whole
// packet is fetched with __ldg; otherwise it is assembled from plain scalar
// reads.
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float4 ploadt_ro<float4, Aligned>(const float* from)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(reinterpret_cast<const float4*>(from));
#else
  const float x = from[0];
  const float y = from[1];
  const float z = from[2];
  const float w = from[3];
  return make_float4(x, y, z, w);
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double2 ploadt_ro<double2, Aligned>(const double* from)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(reinterpret_cast<const double2*>(from));
#else
  const double x = from[0];
  const double y = from[1];
  return make_double2(x, y);
#endif
}
// Read-only unaligned packet load. On compute capability >= 3.5 each scalar
// is fetched with its own __ldg; otherwise plain scalar reads are used.
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float4 ploadt_ro<float4, Unaligned>(const float* from)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  const float x = __ldg(from + 0);
  const float y = __ldg(from + 1);
  const float z = __ldg(from + 2);
  const float w = __ldg(from + 3);
  return make_float4(x, y, z, w);
#else
  const float x = from[0];
  const float y = from[1];
  const float z = from[2];
  const float w = from[3];
  return make_float4(x, y, z, w);
#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double2 ploadt_ro<double2, Unaligned>(const double* from)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  const double x = __ldg(from + 0);
  const double y = __ldg(from + 1);
  return make_double2(x, y);
#else
  const double x = from[0];
  const double y = from[1];
  return make_double2(x, y);
#endif
}
// Strided load: lane i is read from from[i * stride].
template<> EIGEN_DEVICE_FUNC inline
float4 pgather<float, float4>(const float* from, Index stride)
{
  const float x = from[0*stride];
  const float y = from[1*stride];
  const float z = from[2*stride];
  const float w = from[3*stride];
  return make_float4(x, y, z, w);
}
template<> EIGEN_DEVICE_FUNC inline
double2 pgather<double, double2>(const double* from, Index stride)
{
  const double x = from[0*stride];
  const double y = from[1*stride];
  return make_double2(x, y);
}
// Strided scatter: writes lanes x, y, z, w of `from` to offsets
// 0, stride, 2*stride and 3*stride relative to `to`.
template<> EIGEN_DEVICE_FUNC inline
void pscatter<float, float4>(float* to, const float4& from, Index stride) {
  float* const lane0 = to + stride * 0;
  float* const lane1 = to + stride * 1;
  float* const lane2 = to + stride * 2;
  float* const lane3 = to + stride * 3;
  *lane0 = from.x;
  *lane1 = from.y;
  *lane2 = from.z;
  *lane3 = from.w;
}
// Strided scatter: writes lanes x and y of `from` to offsets 0 and `stride`
// relative to `to`.
template<> EIGEN_DEVICE_FUNC inline
void pscatter<double, double2>(double* to, const double2& from, Index stride) {
  double* const lane0 = to + stride * 0;
  double* const lane1 = to + stride * 1;
  *lane0 = from.x;
  *lane1 = from.y;
}
// Returns the first lane (x) of the packet.
template<> EIGEN_DEVICE_FUNC inline
float pfirst<float4>(const float4& a) {
  const float first = a.x;
  return first;
}
// Returns the first lane (x) of the packet.
template<> EIGEN_DEVICE_FUNC inline
double pfirst<double2>(const double2& a) {
  const double first = a.x;
  return first;
}
// Horizontal sum of the four lanes, accumulated left to right:
// ((x + y) + z) + w.
template<> EIGEN_DEVICE_FUNC inline
float predux<float4>(const float4& a) {
  float sum = a.x + a.y;
  sum = sum + a.z;
  sum = sum + a.w;
  return sum;
}
// Horizontal sum of the two lanes.
template<> EIGEN_DEVICE_FUNC inline
double predux<double2>(const double2& a) {
  const double sum = a.x + a.y;
  return sum;
}
// Horizontal maximum of the four lanes, computed pairwise with fmaxf:
// max(max(x, y), max(z, w)).
template<> EIGEN_DEVICE_FUNC inline
float predux_max<float4>(const float4& a) {
  const float max_xy = fmaxf(a.x, a.y);
  const float max_zw = fmaxf(a.z, a.w);
  return fmaxf(max_xy, max_zw);
}
// Horizontal maximum of the two lanes, via fmax.
template<> EIGEN_DEVICE_FUNC inline
double predux_max<double2>(const double2& a) {
  const double largest = fmax(a.x, a.y);
  return largest;
}
// Horizontal minimum of the four lanes, computed pairwise with fminf:
// min(min(x, y), min(z, w)).
template<> EIGEN_DEVICE_FUNC inline
float predux_min<float4>(const float4& a) {
  const float min_xy = fminf(a.x, a.y);
  const float min_zw = fminf(a.z, a.w);
  return fminf(min_xy, min_zw);
}
// Horizontal minimum of the two lanes, via fmin.
template<> EIGEN_DEVICE_FUNC inline
double predux_min<double2>(const double2& a) {
  const double smallest = fmin(a.x, a.y);
  return smallest;
}
// Horizontal product of the four lanes, accumulated left to right:
// ((x * y) * z) * w.
template<> EIGEN_DEVICE_FUNC inline
float predux_mul<float4>(const float4& a) {
  float product = a.x * a.y;
  product = product * a.z;
  product = product * a.w;
  return product;
}
// Horizontal product of the two lanes.
template<> EIGEN_DEVICE_FUNC inline
double predux_mul<double2>(const double2& a) {
  const double product = a.x * a.y;
  return product;
}
// Lane-wise absolute value, using fabsf on each of the four lanes.
template<> EIGEN_DEVICE_FUNC inline
float4 pabs<float4>(const float4& a) {
  const float abs_x = fabsf(a.x);
  const float abs_y = fabsf(a.y);
  const float abs_z = fabsf(a.z);
  const float abs_w = fabsf(a.w);
  return make_float4(abs_x, abs_y, abs_z, abs_w);
}
// Lane-wise absolute value, using fabs on each of the two lanes.
template<> EIGEN_DEVICE_FUNC inline
double2 pabs<double2>(const double2& a) {
  const double abs_x = fabs(a.x);
  const double abs_y = fabs(a.y);
  return make_double2(abs_x, abs_y);
}
// In-place transpose of a 4x4 block held as four float4 packets.
// Each of the six off-diagonal pairs (row i, lane j) <-> (row j, lane i)
// is swapped through a scalar temporary; the diagonal is left untouched.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
  float4& row0 = kernel.packet[0];
  float4& row1 = kernel.packet[1];
  float4& row2 = kernel.packet[2];
  float4& row3 = kernel.packet[3];

  float saved;

  // (0,1) <-> (1,0)
  saved = row0.y;  row0.y = row1.x;  row1.x = saved;
  // (0,2) <-> (2,0)
  saved = row0.z;  row0.z = row2.x;  row2.x = saved;
  // (0,3) <-> (3,0)
  saved = row0.w;  row0.w = row3.x;  row3.x = saved;
  // (1,2) <-> (2,1)
  saved = row1.z;  row1.z = row2.y;  row2.y = saved;
  // (1,3) <-> (3,1)
  saved = row1.w;  row1.w = row3.y;  row3.y = saved;
  // (2,3) <-> (3,2)
  saved = row2.w;  row2.w = row3.z;  row3.z = saved;
}
// In-place transpose of a 2x2 block held as two double2 packets: only the
// off-diagonal pair (row 0, lane y) <-> (row 1, lane x) needs swapping.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<double2,2>& kernel) {
  double2& row0 = kernel.packet[0];
  double2& row1 = kernel.packet[1];
  const double saved = row0.y;
  row0.y = row1.x;
  row1.x = saved;
}
#endif
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_PACKET_MATH_CUDA_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
#define EIGEN_PACKET_MATH_HALF_CUDA_H
namespace
Eigen
{
namespace
internal
{
// Most of the following operations require arch >= 3.0
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
// Marks the CUDA half2 packet type as arithmetic.
template<>
struct is_arithmetic<half2> {
  enum { value = true };
};
// Packet traits for Eigen::half on CUDA: the packet type is half2 (two
// scalars per packet) and there is no smaller half-size packet, so `half`
// aliases the same type. Flags not listed here keep the values inherited
// from default_packet_traits.
template<>
struct packet_traits<Eigen::half> : default_packet_traits
{
  typedef half2 type;
  typedef half2 half;
  enum {
    Vectorizable    = 1,
    AlignedOnScalar = 1,
    size            = 2,
    HasHalfPacket   = 0,

    HasAdd   = 1,
    HasMul   = 1,
    HasDiv   = 1,
    HasSqrt  = 1,
    HasRsqrt = 1,
    HasExp   = 1,
    HasLog   = 1,
    HasLog1p = 1
  };
};
// Reverse traits for half2: scalar type Eigen::half, two scalars per packet,
// Aligned16 alignment, and `half` aliasing half2 itself.
template<>
struct unpacket_traits<half2> {
  typedef Eigen::half type;
  enum {
    size      = 2,
    alignment = Aligned16
  };
  typedef half2 half;
};
// Broadcast: returns a half2 built from `from` with __half2half2.
template<> __device__ EIGEN_STRONG_INLINE
half2 pset1<half2>(const Eigen::half& from) {
  const half2 broadcast = __half2half2(from);
  return broadcast;
}
// Aligned load: reinterprets `from` as a pointer to half2 and reads the
// whole packet in one access.
template<> __device__ EIGEN_STRONG_INLINE
half2 pload<half2>(const Eigen::half* from) {
  const half2* packed = reinterpret_cast<const half2*>(from);
  return *packed;
}
// Unaligned load: reads from[0] and from[1] as separate scalars and packs
// them with __halves2half2.
template<> __device__ EIGEN_STRONG_INLINE
half2 ploadu<half2>(const Eigen::half* from) {
  const half2 packed = __halves2half2(from[0], from[1]);
  return packed;
}
template
<
>
EIGEN_STRONG_INLINE
half2
ploaddup
<
half2
>
(
const
Eigen
::
half
*
from
)
{
return
__halves2half2
(
from
[
0
],
from
[
0
]);
}
// Aligned store: reinterprets `to` as a pointer to half2 and writes the
// whole packet in one access.
template<> __device__ EIGEN_STRONG_INLINE
void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
  half2* packed = reinterpret_cast<half2*>(to);
  *packed = from;
}
// Unaligned store: the low and high halves of `from` are extracted and
// written as two separate scalars (low -> to[0], high -> to[1]).
template<> __device__ EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
  *(to + 0) = __low2half(from);
  *(to + 1) = __high2half(from);
}
// Read-only aligned load of a half2.
// For __CUDA_ARCH__ >= 350 the packet is fetched with one __ldg on a half2
// pointer; otherwise the two scalars are read and packed individually.
template<> __device__ EIGEN_ALWAYS_INLINE
half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
  const half2* packed = reinterpret_cast<const half2*>(from);
  return __ldg(packed);
#else
  return __halves2half2(from[0], from[1]);
#endif
}
// Read-only unaligned load of a half2.
// The two scalars are always read separately: via __ldg for
// __CUDA_ARCH__ >= 350, via plain dereference otherwise.
template<> __device__ EIGEN_ALWAYS_INLINE
half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
  return __halves2half2(__ldg(from + 0),
                        __ldg(from + 1));
#else
  return __halves2half2(from[0], from[1]);
#endif
}
// Strided gather: packs the elements at offsets 0 and `stride` relative to
// `from` into a half2.
template<> __device__ EIGEN_STRONG_INLINE
half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
  const Eigen::half* low  = from + 0 * stride;
  const Eigen::half* high = from + 1 * stride;
  return __halves2half2(*low, *high);
}
// Strided scatter: writes the low half of `from` to offset 0 and the high
// half to offset `stride` relative to `to`.
template<> __device__ EIGEN_STRONG_INLINE
void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
  Eigen::half* low  = to + stride * 0;
  Eigen::half* high = to + stride * 1;
  *low  = __low2half(from);
  *high = __high2half(from);
}
// Returns the first (low) element of the packet.
template<> __device__ EIGEN_STRONG_INLINE
Eigen::half pfirst<half2>(const half2& a) {
  return __low2half(a);
}
// Lane-wise absolute value done on the raw bits: the packet is viewed as one
// 32-bit word and ANDed with 0x7FFF7FFF, which clears the top bit (the sign
// bit) of each 16-bit half.
template<> __device__ EIGEN_STRONG_INLINE
half2 pabs<half2>(const half2& a) {
  const unsigned raw_bits = *(reinterpret_cast<const unsigned*>(&(a)));
  half2 result;
  *(reinterpret_cast<unsigned*>(&(result))) = raw_bits & 0x7FFF7FFF;
  return result;
}
// In-place transpose of a 2x2 block held as two half2 packets.
// After the call packet[0] holds the two low elements and packet[1] the two
// high elements of the original packets.
__device__ EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
  const half2 row0 = kernel.packet[0];
  const half2 row1 = kernel.packet[1];

  __half row0_lo = __low2half(row0);
  __half row0_hi = __high2half(row0);
  __half row1_lo = __low2half(row1);
  __half row1_hi = __high2half(row1);

  kernel.packet[0] = __halves2half2(row0_lo, row1_lo);
  kernel.packet[1] = __halves2half2(row0_hi, row1_hi);
}
// Linear set: returns the packet (a, a + 1).
// For __CUDA_ARCH__ >= 530 the increment uses the half-precision __hadd;
// otherwise it is computed in float and converted back with __float2half.
template<> __device__ EIGEN_STRONG_INLINE
half2 plset<half2>(const Eigen::half& a) {
#if __CUDA_ARCH__ >= 530
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
  const float next = __half2float(a) + 1.0f;
  return __halves2half2(a, __float2half(next));
#endif
}
// Lane-wise addition.
// For __CUDA_ARCH__ >= 530 this is a single __hadd2; otherwise both lanes are
// widened to float, added, and rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 padd<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hadd2(a, b);
#else
  const float a_lo = __low2float(a);
  const float a_hi = __high2float(a);
  const float b_lo = __low2float(b);
  const float b_hi = __high2float(b);
  const float sum_lo = a_lo + b_lo;
  const float sum_hi = a_hi + b_hi;
  return __floats2half2_rn(sum_lo, sum_hi);
#endif
}
// Lane-wise subtraction (a - b).
// For __CUDA_ARCH__ >= 530 this is a single __hsub2; otherwise both lanes are
// widened to float, subtracted, and rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 psub<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hsub2(a, b);
#else
  const float a_lo = __low2float(a);
  const float a_hi = __high2float(a);
  const float b_lo = __low2float(b);
  const float b_hi = __high2float(b);
  const float diff_lo = a_lo - b_lo;
  const float diff_hi = a_hi - b_hi;
  return __floats2half2_rn(diff_lo, diff_hi);
#endif
}
// Lane-wise negation.
// For __CUDA_ARCH__ >= 530 this is a single __hneg2; otherwise both lanes are
// widened to float, negated, and rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 pnegate(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hneg2(a);
#else
  const float lo = __low2float(a);
  const float hi = __high2float(a);
  return __floats2half2_rn(-lo, -hi);
#endif
}
// Conjugate of a half2 packet: returns the input unchanged.
template<> __device__ EIGEN_STRONG_INLINE
half2 pconj(const half2& a) {
  return a;
}
// Lane-wise multiplication.
// For __CUDA_ARCH__ >= 530 this is a single __hmul2; otherwise both lanes are
// widened to float, multiplied, and rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 pmul<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
  return __hmul2(a, b);
#else
  const float a_lo = __low2float(a);
  const float a_hi = __high2float(a);
  const float b_lo = __low2float(b);
  const float b_hi = __high2float(b);
  const float prod_lo = a_lo * b_lo;
  const float prod_hi = a_hi * b_hi;
  return __floats2half2_rn(prod_lo, prod_hi);
#endif
}
// Lane-wise multiply-add: a * b + c.
// For __CUDA_ARCH__ >= 530 this is a single __hfma2; otherwise every lane is
// widened to float, combined, and rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if __CUDA_ARCH__ >= 530
  return __hfma2(a, b, c);
#else
  const float a_lo = __low2float(a);
  const float a_hi = __high2float(a);
  const float b_lo = __low2float(b);
  const float b_hi = __high2float(b);
  const float c_lo = __low2float(c);
  const float c_hi = __high2float(c);
  const float res_lo = a_lo * b_lo + c_lo;
  const float res_hi = a_hi * b_hi + c_hi;
  return __floats2half2_rn(res_lo, res_hi);
#endif
}
// Lane-wise division (a / b): both lanes are widened to float, divided, and
// rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 pdiv<half2>(const half2& a, const half2& b) {
  const float num_lo = __low2float(a);
  const float num_hi = __high2float(a);
  const float den_lo = __low2float(b);
  const float den_hi = __high2float(b);
  const float quot_lo = num_lo / den_lo;
  const float quot_hi = num_hi / den_hi;
  return __floats2half2_rn(quot_lo, quot_hi);
}
// Lane-wise minimum. The comparison is done on float copies of the lanes,
// but the result lanes are the original half values (no rounding step).
// When the comparison is false (including ties) the lane of `b` is chosen.
template<> __device__ EIGEN_STRONG_INLINE
half2 pmin<half2>(const half2& a, const half2& b) {
  const float a_lo = __low2float(a);
  const float a_hi = __high2float(a);
  const float b_lo = __low2float(b);
  const float b_hi = __high2float(b);
  __half min_lo = a_lo < b_lo ? __low2half(a) : __low2half(b);
  __half min_hi = a_hi < b_hi ? __high2half(a) : __high2half(b);
  return __halves2half2(min_lo, min_hi);
}
// Lane-wise maximum. The comparison is done on float copies of the lanes,
// but the result lanes are the original half values (no rounding step).
// When the comparison is false (including ties) the lane of `b` is chosen.
template<> __device__ EIGEN_STRONG_INLINE
half2 pmax<half2>(const half2& a, const half2& b) {
  const float a_lo = __low2float(a);
  const float a_hi = __high2float(a);
  const float b_lo = __low2float(b);
  const float b_hi = __high2float(b);
  __half max_lo = a_lo > b_lo ? __low2half(a) : __low2half(b);
  __half max_hi = a_hi > b_hi ? __high2half(a) : __high2half(b);
  return __halves2half2(max_lo, max_hi);
}
// Horizontal sum of the two lanes.
// For __CUDA_ARCH__ >= 530 the halves are added with __hadd; otherwise they
// are added in float and converted back with __float2half_rn.
template<> __device__ EIGEN_STRONG_INLINE
Eigen::half predux<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hadd(__low2half(a), __high2half(a));
#else
  const float lo = __low2float(a);
  const float hi = __high2float(a);
  return Eigen::half(__float2half_rn(lo + hi));
#endif
}
// Horizontal maximum of the two lanes.
// For __CUDA_ARCH__ >= 530 the halves are compared with __hgt; otherwise the
// comparison is done on float copies. In both paths the returned value is
// one of the original half lanes, and the high lane wins when the comparison
// is false.
template<> __device__ EIGEN_STRONG_INLINE
Eigen::half predux_max<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  __half lo = __low2half(a);
  __half hi = __high2half(a);
  return __hgt(lo, hi) ? lo : hi;
#else
  const float lo = __low2float(a);
  const float hi = __high2float(a);
  return lo > hi ? __low2half(a) : __high2half(a);
#endif
}
// Horizontal minimum of the two lanes.
// For __CUDA_ARCH__ >= 530 the halves are compared with __hlt; otherwise the
// comparison is done on float copies. In both paths the returned value is
// one of the original half lanes, and the high lane wins when the comparison
// is false.
template<> __device__ EIGEN_STRONG_INLINE
Eigen::half predux_min<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  __half lo = __low2half(a);
  __half hi = __high2half(a);
  return __hlt(lo, hi) ? lo : hi;
#else
  const float lo = __low2float(a);
  const float hi = __high2float(a);
  return lo < hi ? __low2half(a) : __high2half(a);
#endif
}
// Horizontal product of the two lanes.
// For __CUDA_ARCH__ >= 530 the halves are multiplied with __hmul; otherwise
// they are multiplied in float and converted back with __float2half_rn.
template<> __device__ EIGEN_STRONG_INLINE
Eigen::half predux_mul<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
  return __hmul(__low2half(a), __high2half(a));
#else
  const float lo = __low2float(a);
  const float hi = __high2float(a);
  return Eigen::half(__float2half_rn(lo * hi));
#endif
}
// Lane-wise log1p: each lane is widened to float, passed through log1pf, and
// the pair is rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 plog1p<half2>(const half2& a) {
  const float in_lo = __low2float(a);
  const float in_hi = __high2float(a);
  const float out_lo = log1pf(in_lo);
  const float out_hi = log1pf(in_hi);
  return __floats2half2_rn(out_lo, out_hi);
}
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
// Lane-wise natural logarithm using the packed-half intrinsic h2log.
template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
  const half2 result = h2log(a);
  return result;
}
// Lane-wise exponential using the packed-half intrinsic h2exp.
template<> __device__ EIGEN_STRONG_INLINE
half2 pexp<half2>(const half2& a) {
  const half2 result = h2exp(a);
  return result;
}
// Lane-wise square root using the packed-half intrinsic h2sqrt.
template<> __device__ EIGEN_STRONG_INLINE
half2 psqrt<half2>(const half2& a) {
  const half2 result = h2sqrt(a);
  return result;
}
// Lane-wise reciprocal square root using the packed-half intrinsic h2rsqrt.
template<> __device__ EIGEN_STRONG_INLINE
half2 prsqrt<half2>(const half2& a) {
  const half2 result = h2rsqrt(a);
  return result;
}
#else
// Fallback lane-wise natural logarithm: each lane is widened to float,
// passed through logf, and the pair is rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
  const float in_lo = __low2float(a);
  const float in_hi = __high2float(a);
  const float out_lo = logf(in_lo);
  const float out_hi = logf(in_hi);
  return __floats2half2_rn(out_lo, out_hi);
}
// Fallback lane-wise exponential: each lane is widened to float, passed
// through expf, and the pair is rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 pexp<half2>(const half2& a) {
  const float in_lo = __low2float(a);
  const float in_hi = __high2float(a);
  const float out_lo = expf(in_lo);
  const float out_hi = expf(in_hi);
  return __floats2half2_rn(out_lo, out_hi);
}
// Fallback lane-wise square root: each lane is widened to float, passed
// through sqrtf, and the pair is rounded back with __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 psqrt<half2>(const half2& a) {
  const float in_lo = __low2float(a);
  const float in_hi = __high2float(a);
  const float out_lo = sqrtf(in_lo);
  const float out_hi = sqrtf(in_hi);
  return __floats2half2_rn(out_lo, out_hi);
}
// Fallback lane-wise reciprocal square root: each lane is widened to float,
// passed through rsqrtf, and the pair is rounded back with
// __floats2half2_rn.
template<> __device__ EIGEN_STRONG_INLINE
half2 prsqrt<half2>(const half2& a) {
  const float in_lo = __low2float(a);
  const float in_hi = __high2float(a);
  const float out_lo = rsqrtf(in_lo);
  const float out_hi = rsqrtf(in_hi);
  return __floats2half2_rn(out_lo, out_hi);
}
#endif
#elif defined EIGEN_VECTORIZE_AVX512
// Packet of sixteen 16-bit half values, stored as raw bits in one 256-bit
// integer register.
typedef struct {
  __m256i x;
} Packet16h;
// Marks Packet16h as an arithmetic type.
template<>
struct is_arithmetic<Packet16h> {
  enum { value = true };
};
// Packet traits for half under AVX512: sixteen scalars per packet.
// Every Has* capability listed below is explicitly switched off.
template<>
struct packet_traits<half> : default_packet_traits {
  typedef Packet16h type;
  // There is no half-size packet for Packet16h.
  typedef Packet16h half;
  enum {
    Vectorizable    = 1,
    AlignedOnScalar = 1,
    size            = 16,
    HasHalfPacket   = 0,

    HasAdd       = 0,
    HasSub       = 0,
    HasMul       = 0,
    HasNegate    = 0,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasConj      = 0,
    HasSetLinear = 0,
    HasDiv       = 0,
    HasSqrt      = 0,
    HasRsqrt     = 0,
    HasExp       = 0,
    HasLog       = 0,
    HasBlend     = 0
  };
};
// Reverse traits for Packet16h: scalar type Eigen::half, sixteen scalars per
// packet, Aligned32 alignment, and `half` aliasing Packet16h itself.
template<>
struct unpacket_traits<Packet16h> {
  typedef Eigen::half type;
  enum {
    size      = 16,
    alignment = Aligned32
  };
  typedef Packet16h half;
};
// Broadcast: replicates the raw 16-bit pattern of `from` into all sixteen
// lanes.
template<> EIGEN_STRONG_INLINE
Packet16h pset1<Packet16h>(const Eigen::half& from) {
  Packet16h broadcast;
  broadcast.x = _mm256_set1_epi16(from.x);
  return broadcast;
}
// Returns lane 0 of the packet: the 16-bit word is extracted and wrapped
// back into a half via raw_uint16_to_half.
template<> EIGEN_STRONG_INLINE
Eigen::half pfirst<Packet16h>(const Packet16h& from) {
  const unsigned short raw_bits =
      static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0));
  return half_impl::raw_uint16_to_half(raw_bits);
}
// Aligned load of sixteen halves through _mm256_load_si256.
template<> EIGEN_STRONG_INLINE
Packet16h pload<Packet16h>(const Eigen::half* from) {
  const __m256i* src = reinterpret_cast<const __m256i*>(from);
  Packet16h loaded;
  loaded.x = _mm256_load_si256(src);
  return loaded;
}
// Unaligned load of sixteen halves through _mm256_loadu_si256.
template<> EIGEN_STRONG_INLINE
Packet16h ploadu<Packet16h>(const Eigen::half* from) {
  const __m256i* src = reinterpret_cast<const __m256i*>(from);
  Packet16h loaded;
  loaded.x = _mm256_loadu_si256(src);
  return loaded;
}
// Aligned store of sixteen halves through _mm256_store_si256.
template<> EIGEN_STRONG_INLINE
void pstore<half>(Eigen::half* to, const Packet16h& from) {
  __m256i* dst = reinterpret_cast<__m256i*>(to);
  _mm256_store_si256(dst, from.x);
}
// Unaligned store of sixteen halves through _mm256_storeu_si256.
template<> EIGEN_STRONG_INLINE
void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
  __m256i* dst = reinterpret_cast<__m256i*>(to);
  _mm256_storeu_si256(dst, from.x);
}
// Quadruplicating load: reads from[0..3] and repeats each value four times,
// producing {a,a,a,a, b,b,b,b, c,c,c,c, d,d,d,d} from lane 0 upwards.
// (_mm256_set_epi16 takes its arguments from the highest lane down.)
template<> EIGEN_STRONG_INLINE
Packet16h ploadquad(const Eigen::half* from) {
  const unsigned short a = from[0].x;
  const unsigned short b = from[1].x;
  const unsigned short c = from[2].x;
  const unsigned short d = from[3].x;
  Packet16h result;
  result.x = _mm256_set_epi16(d, d, d, d,
                              c, c, c, c,
                              b, b, b, b,
                              a, a, a, a);
  return result;
}
// Widens sixteen halves to sixteen floats.
// With EIGEN_HAS_FP16_C the conversion is a single _mm512_cvtph_ps;
// otherwise the packet is spilled to an aligned buffer, each element is
// converted to float on its own, and the packet is rebuilt with
// _mm512_set_ps (which takes its arguments from the highest lane down).
EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm512_cvtph_ps(a.x);
#else
  EIGEN_ALIGN64 half aux[16];
  pstore(aux, a);

  float f[16];
  for (int i = 0; i < 16; ++i) {
    f[i] = static_cast<float>(aux[i]);
  }

  return _mm512_set_ps(f[15], f[14], f[13], f[12],
                       f[11], f[10], f[9],  f[8],
                       f[7],  f[6],  f[5],  f[4],
                       f[3],  f[2],  f[1],  f[0]);
#endif
}
// Narrows sixteen floats to sixteen halves.
// With EIGEN_HAS_FP16_C the conversion is a single _mm512_cvtps_ph using
// round-to-nearest with exceptions suppressed; otherwise the packet is
// spilled to an aligned buffer, each element is converted through the half
// constructor, and the raw 16-bit patterns are repacked with
// _mm256_set_epi16 (which takes its arguments from the highest lane down).
EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#ifdef EIGEN_HAS_FP16_C
  Packet16h result;
  result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
  return result;
#else
  EIGEN_ALIGN64 float aux[16];
  pstore(aux, a);

  unsigned short bits[16];
  for (int i = 0; i < 16; ++i) {
    bits[i] = half(aux[i]).x;
  }

  Packet16h result;
  result.x = _mm256_set_epi16(
      bits[15], bits[14], bits[13], bits[12],
      bits[11], bits[10], bits[9],  bits[8],
      bits[7],  bits[6],  bits[5],  bits[4],
      bits[3],  bits[2],  bits[1],  bits[0]);
  return result;
#endif
}
// Lane-wise addition: both operands are widened to Packet16f, added there,
// and the sum is narrowed back to halves.
template<> EIGEN_STRONG_INLINE
Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
  const Packet16f lhs = half2float(a);
  const Packet16f rhs = half2float(b);
  return float2half(padd(lhs, rhs));
}
// Lane-wise multiplication: both operands are widened to Packet16f,
// multiplied there, and the product is narrowed back to halves.
template<> EIGEN_STRONG_INLINE
Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
  const Packet16f lhs = half2float(a);
  const Packet16f rhs = half2float(b);
  return float2half(pmul(lhs, rhs));
}
// Horizontal sum: the packet is widened to Packet16f, reduced there, and the
// scalar result is converted to half.
template<> EIGEN_STRONG_INLINE
half predux<Packet16h>(const Packet16h& from) {
  const Packet16f widened = half2float(from);
  return half(predux(widened));
}
// Strided gather: lane i of the result holds the raw bits of
// from[i * stride], for i = 0..15.
// (_mm256_set_epi16 takes its arguments from the highest lane down.)
template<> EIGEN_STRONG_INLINE
Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
  unsigned short lane[16];
  for (Index i = 0; i < 16; ++i) {
    lane[i] = from[i * stride].x;
  }

  Packet16h result;
  result.x = _mm256_set_epi16(
      lane[15], lane[14], lane[13], lane[12],
      lane[11], lane[10], lane[9],  lane[8],
      lane[7],  lane[6],  lane[5],  lane[4],
      lane[3],  lane[2],  lane[1],  lane[0]);
  return result;
}
// Strided scatter: the packet is spilled to an aligned buffer and the raw
// bits of lane i are written to to[stride * i], for i = 0..15.
template<> EIGEN_STRONG_INLINE
void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
  EIGEN_ALIGN64 half aux[16];
  pstore(aux, from);
  for (int i = 0; i < 16; ++i) {
    to[stride * i].x = aux[i].x;
  }
}
// In-place transpose of a 16x16 block of halves held as sixteen Packet16h.
//
// The transpose is built from three rounds of interleaving (16-, 32- and
// 64-bit unpacks) followed by a 128-bit lane permute. The AVX2 unpack
// instructions work independently on each 128-bit lane, so after the 64-bit
// round every intermediate holds TWO transposed half-rows: row k in its low
// lane and row k + 8 in its high lane, where
//   _01 -> k = 0,  _23 -> k = 1,  _45 -> k = 2,  _67 -> k = 3,
//   _89 -> k = 4,  _ab -> k = 5,  _cd -> k = 6,  _ef -> k = 7.
// Consequently the final permute with 0x20 (low lanes) produces output row k
// and the permute with 0x31 (high lanes) produces output row k + 8. Output
// rows are stored in that order below; storing the 0x20/0x31 results to
// consecutive packets would yield the row order 0,8,1,9,... instead.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,16>& kernel) {
  __m256i a = kernel.packet[0].x;
  __m256i b = kernel.packet[1].x;
  __m256i c = kernel.packet[2].x;
  __m256i d = kernel.packet[3].x;
  __m256i e = kernel.packet[4].x;
  __m256i f = kernel.packet[5].x;
  __m256i g = kernel.packet[6].x;
  __m256i h = kernel.packet[7].x;
  __m256i i = kernel.packet[8].x;
  __m256i j = kernel.packet[9].x;
  __m256i k = kernel.packet[10].x;
  __m256i l = kernel.packet[11].x;
  __m256i m = kernel.packet[12].x;
  __m256i n = kernel.packet[13].x;
  __m256i o = kernel.packet[14].x;
  __m256i p = kernel.packet[15].x;

  // Round 1: interleave 16-bit elements of adjacent input rows.
  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
  __m256i op_07 = _mm256_unpacklo_epi16(o, p);

  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
  __m256i op_8f = _mm256_unpackhi_epi16(o, p);

  // Round 2: interleave 32-bit pairs.
  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);

  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);

  // Round 3: interleave 64-bit quads.
  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);

  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
  // 0x20 joins the two low lanes  -> output rows 0..7.
  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
  // 0x31 joins the two high lanes -> output rows 8..15.
  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);

  kernel.packet[0].x = a_p_0;
  kernel.packet[1].x = a_p_1;
  kernel.packet[2].x = a_p_2;
  kernel.packet[3].x = a_p_3;
  kernel.packet[4].x = a_p_4;
  kernel.packet[5].x = a_p_5;
  kernel.packet[6].x = a_p_6;
  kernel.packet[7].x = a_p_7;
  kernel.packet[8].x = a_p_8;
  kernel.packet[9].x = a_p_9;
  kernel.packet[10].x = a_p_a;
  kernel.packet[11].x = a_p_b;
  kernel.packet[12].x = a_p_c;
  kernel.packet[13].x = a_p_d;
  kernel.packet[14].x = a_p_e;
  kernel.packet[15].x = a_p_f;
}
// Transposes an 8-packet block through scalar buffers.
// The eight input packets are spilled to in[8][16]; output packet r is then
// assembled as column 2r of all eight inputs (lanes 0..7) followed by
// column 2r+1 of all eight inputs (lanes 8..15), and reloaded.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,8>& kernel) {
  EIGEN_ALIGN64 half in[8][16];
  for (int row = 0; row < 8; ++row) {
    pstore<half>(in[row], kernel.packet[row]);
  }

  EIGEN_ALIGN64 half out[8][16];
  for (int r = 0; r < 8; ++r) {
    for (int src = 0; src < 8; ++src) {
      out[r][src]     = in[src][2*r];
      out[r][src + 8] = in[src][2*r + 1];
    }
  }

  for (int row = 0; row < 8; ++row) {
    kernel.packet[row] = pload<Packet16h>(out[row]);
  }
}
// Transposes a 4-packet block through scalar buffers.
// The four input packets are spilled to in[4][16]; output packet r is then
// assembled from columns 4r, 4r+1, 4r+2 and 4r+3 of the four inputs, each
// column occupying four consecutive lanes, and reloaded.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,4>& kernel) {
  EIGEN_ALIGN64 half in[4][16];
  for (int row = 0; row < 4; ++row) {
    pstore<half>(in[row], kernel.packet[row]);
  }

  EIGEN_ALIGN64 half out[4][16];
  for (int r = 0; r < 4; ++r) {
    for (int col = 0; col < 4; ++col) {
      for (int src = 0; src < 4; ++src) {
        out[r][src + 4*col] = in[src][4*r + col];
      }
    }
  }

  for (int row = 0; row < 4; ++row) {
    kernel.packet[row] = pload<Packet16h>(out[row]);
  }
}
#elif defined EIGEN_VECTORIZE_AVX
// Packet of eight 16-bit half values, stored as raw bits in one 128-bit
// integer register.
typedef struct {
  __m128i x;
} Packet8h;
// Marks Packet8h as an arithmetic type.
template<>
struct is_arithmetic<Packet8h> {
  enum { value = true };
};
// Packet traits for Eigen::half under AVX: eight scalars per packet.
// Every Has* capability listed below is explicitly switched off.
template<>
struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet8h type;
  // There is no half-size packet for Packet8h.
  typedef Packet8h half;
  enum {
    Vectorizable    = 1,
    AlignedOnScalar = 1,
    size            = 8,
    HasHalfPacket   = 0,

    HasAdd       = 0,
    HasSub       = 0,
    HasMul       = 0,
    HasNegate    = 0,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasConj      = 0,
    HasSetLinear = 0,
    HasDiv       = 0,
    HasSqrt      = 0,
    HasRsqrt     = 0,
    HasExp       = 0,
    HasLog       = 0,
    HasBlend     = 0
  };
};
// Reverse traits for Packet8h: scalar type Eigen::half, eight scalars per
// packet, Aligned16 alignment, and `half` aliasing Packet8h itself.
template<>
struct unpacket_traits<Packet8h> {
  typedef Eigen::half type;
  enum {
    size      = 8,
    alignment = Aligned16
  };
  typedef Packet8h half;
};
// Broadcast: replicates the raw 16-bit pattern of `from` into all eight
// lanes.
template<> EIGEN_STRONG_INLINE
Packet8h pset1<Packet8h>(const Eigen::half& from) {
  Packet8h broadcast;
  broadcast.x = _mm_set1_epi16(from.x);
  return broadcast;
}
// Returns lane 0 of the packet: the 16-bit word is extracted and wrapped
// back into a half via raw_uint16_to_half.
template<> EIGEN_STRONG_INLINE
Eigen::half pfirst<Packet8h>(const Packet8h& from) {
  const unsigned short raw_bits =
      static_cast<unsigned short>(_mm_extract_epi16(from.x, 0));
  return half_impl::raw_uint16_to_half(raw_bits);
}
// Aligned load of eight halves through _mm_load_si128.
template<> EIGEN_STRONG_INLINE
Packet8h pload<Packet8h>(const Eigen::half* from) {
  const __m128i* src = reinterpret_cast<const __m128i*>(from);
  Packet8h loaded;
  loaded.x = _mm_load_si128(src);
  return loaded;
}
// Unaligned load of eight halves through _mm_loadu_si128.
template<> EIGEN_STRONG_INLINE
Packet8h ploadu<Packet8h>(const Eigen::half* from) {
  const __m128i* src = reinterpret_cast<const __m128i*>(from);
  Packet8h loaded;
  loaded.x = _mm_loadu_si128(src);
  return loaded;
}
// Aligned store of eight halves through _mm_store_si128.
template<> EIGEN_STRONG_INLINE
void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  __m128i* dst = reinterpret_cast<__m128i*>(to);
  _mm_store_si128(dst, from.x);
}
// Unaligned store of eight halves through _mm_storeu_si128.
template<> EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  __m128i* dst = reinterpret_cast<__m128i*>(to);
  _mm_storeu_si128(dst, from.x);
}
// Quadruplicating load: reads from[0] and from[1] and repeats each value
// four times, producing {a,a,a,a, b,b,b,b} from lane 0 upwards.
// (_mm_set_epi16 takes its arguments from the highest lane down.)
template<> EIGEN_STRONG_INLINE
Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
  const unsigned short a = from[0].x;
  const unsigned short b = from[1].x;
  Packet8h result;
  result.x = _mm_set_epi16(b, b, b, b,
                           a, a, a, a);
  return result;
}
// Widens eight halves to eight floats.
// With EIGEN_HAS_FP16_C the conversion is a single _mm256_cvtph_ps;
// otherwise the packet is spilled to an aligned buffer, each element is
// converted to float on its own, and the packet is rebuilt with
// _mm256_set_ps (which takes its arguments from the highest lane down).
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm256_cvtph_ps(a.x);
#else
  EIGEN_ALIGN32 Eigen::half aux[8];
  pstore(aux, a);

  float f[8];
  for (int i = 0; i < 8; ++i) {
    f[i] = static_cast<float>(aux[i]);
  }

  return _mm256_set_ps(f[7], f[6], f[5], f[4],
                       f[3], f[2], f[1], f[0]);
#endif
}
// Narrows eight floats to eight halves.
// With EIGEN_HAS_FP16_C the conversion is a single _mm256_cvtps_ph using
// round-to-nearest with exceptions suppressed; otherwise the packet is
// spilled to an aligned buffer, each element is converted through the half
// constructor, and the raw 16-bit patterns are repacked with _mm_set_epi16
// (which takes its arguments from the highest lane down).
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#ifdef EIGEN_HAS_FP16_C
  Packet8h result;
  result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
  return result;
#else
  EIGEN_ALIGN32 float aux[8];
  pstore(aux, a);

  unsigned short bits[8];
  for (int i = 0; i < 8; ++i) {
    bits[i] = Eigen::half(aux[i]).x;
  }

  Packet8h result;
  result.x = _mm_set_epi16(bits[7], bits[6], bits[5], bits[4],
                           bits[3], bits[2], bits[1], bits[0]);
  return result;
#endif
}
// Conjugate of a Packet8h: returns the input unchanged.
template<> EIGEN_STRONG_INLINE
Packet8h pconj(const Packet8h& a) {
  return a;
}
// Lane-wise addition: both operands are widened to Packet8f, added there,
// and the sum is narrowed back to halves.
template<> EIGEN_STRONG_INLINE
Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
  const Packet8f lhs = half2float(a);
  const Packet8f rhs = half2float(b);
  return float2half(padd(lhs, rhs));
}
// Lane-wise multiplication: both operands are widened to Packet8f,
// multiplied there, and the product is narrowed back to halves.
template<> EIGEN_STRONG_INLINE
Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
  const Packet8f lhs = half2float(a);
  const Packet8f rhs = half2float(b);
  return float2half(pmul(lhs, rhs));
}
// Strided gather: lane i of the result holds the raw bits of
// from[i * stride], for i = 0..7.
// (_mm_set_epi16 takes its arguments from the highest lane down.)
template<> EIGEN_STRONG_INLINE
Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
  unsigned short lane[8];
  for (Index i = 0; i < 8; ++i) {
    lane[i] = from[i * stride].x;
  }

  Packet8h result;
  result.x = _mm_set_epi16(lane[7], lane[6], lane[5], lane[4],
                           lane[3], lane[2], lane[1], lane[0]);
  return result;
}
// Strided scatter: the packet is spilled to an aligned buffer and the raw
// bits of lane i are written to to[stride * i], for i = 0..7.
template<> EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
  EIGEN_ALIGN32 Eigen::half aux[8];
  pstore(aux, from);
  for (int i = 0; i < 8; ++i) {
    to[stride * i].x = aux[i].x;
  }
}
// Horizontal sum: the packet is widened to Packet8f, reduced there, and the
// float result is converted to half.
template<> EIGEN_STRONG_INLINE
Eigen::half predux<Packet8h>(const Packet8h& a) {
  const Packet8f widened = half2float(a);
  const float total = predux<Packet8f>(widened);
  return Eigen::half(total);
}
// Horizontal maximum: the packet is widened to Packet8f, reduced there, and
// the float result is converted to half.
template<> EIGEN_STRONG_INLINE
Eigen::half predux_max<Packet8h>(const Packet8h& a) {
  const Packet8f widened = half2float(a);
  const float largest = predux_max<Packet8f>(widened);
  return Eigen::half(largest);
}
// Reduces a half packet with predux_min<Packet8f> after widening it to
// floats; the float result is converted back to Eigen::half.
template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
  const Packet8f widened = half2float(a);
  const float smallest = predux_min<Packet8f>(widened);
  return Eigen::half(smallest);
}
// Reduces a half packet with predux_mul<Packet8f> after widening it to
// floats; the float result is converted back to Eigen::half.
template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
  const Packet8f widened = half2float(a);
  const float product = predux_mul<Packet8f>(widened);
  return Eigen::half(product);
}
// In-place transpose of an 8x8 block of halves held in eight Packet8h rows.
// Rows are called a..h below; a name such as abcd01 means "elements 0 and 1
// of rows a, b, c and d". Three rounds of unpack (16-, 32-, then 64-bit
// granularity) bring the i-th element of every row into output packet i.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) {
  const __m128i row_a = kernel.packet[0].x;
  const __m128i row_b = kernel.packet[1].x;
  const __m128i row_c = kernel.packet[2].x;
  const __m128i row_d = kernel.packet[3].x;
  const __m128i row_e = kernel.packet[4].x;
  const __m128i row_f = kernel.packet[5].x;
  const __m128i row_g = kernel.packet[6].x;
  const __m128i row_h = kernel.packet[7].x;

  // Round 1: interleave 16-bit elements of neighbouring rows.
  const __m128i ab03 = _mm_unpacklo_epi16(row_a, row_b);
  const __m128i cd03 = _mm_unpacklo_epi16(row_c, row_d);
  const __m128i ef03 = _mm_unpacklo_epi16(row_e, row_f);
  const __m128i gh03 = _mm_unpacklo_epi16(row_g, row_h);
  const __m128i ab47 = _mm_unpackhi_epi16(row_a, row_b);
  const __m128i cd47 = _mm_unpackhi_epi16(row_c, row_d);
  const __m128i ef47 = _mm_unpackhi_epi16(row_e, row_f);
  const __m128i gh47 = _mm_unpackhi_epi16(row_g, row_h);

  // Round 2: interleave 32-bit pairs.
  const __m128i abcd01 = _mm_unpacklo_epi32(ab03, cd03);
  const __m128i abcd23 = _mm_unpackhi_epi32(ab03, cd03);
  const __m128i efgh01 = _mm_unpacklo_epi32(ef03, gh03);
  const __m128i efgh23 = _mm_unpackhi_epi32(ef03, gh03);
  const __m128i abcd45 = _mm_unpacklo_epi32(ab47, cd47);
  const __m128i abcd67 = _mm_unpackhi_epi32(ab47, cd47);
  const __m128i efgh45 = _mm_unpacklo_epi32(ef47, gh47);
  const __m128i efgh67 = _mm_unpackhi_epi32(ef47, gh47);

  // Round 3: combine 64-bit halves into the transposed rows.
  kernel.packet[0].x = _mm_unpacklo_epi64(abcd01, efgh01);
  kernel.packet[1].x = _mm_unpackhi_epi64(abcd01, efgh01);
  kernel.packet[2].x = _mm_unpacklo_epi64(abcd23, efgh23);
  kernel.packet[3].x = _mm_unpackhi_epi64(abcd23, efgh23);
  kernel.packet[4].x = _mm_unpacklo_epi64(abcd45, efgh45);
  kernel.packet[5].x = _mm_unpackhi_epi64(abcd45, efgh45);
  kernel.packet[6].x = _mm_unpacklo_epi64(abcd67, efgh67);
  kernel.packet[7].x = _mm_unpackhi_epi64(abcd67, efgh67);
}
// Transposes a block of four Packet8h through scalar temporaries.
// Output packet i receives column 2*i of the four inputs in lanes 0..3 and
// column 2*i+1 in lanes 4..7.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,4>& kernel) {
  EIGEN_ALIGN32 Eigen::half in[4][8];
  for (int p = 0; p < 4; ++p) {
    pstore<Eigen::half>(in[p], kernel.packet[p]);
  }

  EIGEN_ALIGN32 Eigen::half out[4][8];
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j]   = in[j][2*i];
      out[i][j+4] = in[j][2*i+1];
    }
  }

  for (int p = 0; p < 4; ++p) {
    kernel.packet[p] = pload<Packet8h>(out[p]);
  }
}
// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#elif 0
// Packet of four halves stored in a single __m64 register.
typedef struct {
  __m64 x;
} Packet4h;
// Marks Packet4h as an arithmetic type for Eigen's trait machinery.
template<> struct is_arithmetic<Packet4h> {
  enum { value = true };
};
// Packet traits for Eigen::half on this code path: the packet is a Packet4h
// (4 elements) and every optional Has* capability is switched off.
template <>
struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet4h type;
  // There is no half-size packet for Packet4h.
  typedef Packet4h half;
  enum {
    Vectorizable    = 1,
    AlignedOnScalar = 1,
    size            = 4,
    HasHalfPacket   = 0,

    HasAdd       = 0,
    HasSub       = 0,
    HasMul       = 0,
    HasNegate    = 0,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasConj      = 0,
    HasSetLinear = 0,
    HasDiv       = 0,
    HasSqrt      = 0,
    HasRsqrt     = 0,
    HasExp       = 0,
    HasLog       = 0,
    HasBlend     = 0
  };
};
// Reverse mapping for Packet4h: scalar type Eigen::half, 4 elements,
// alignment tag Aligned16, and itself as the "half" packet.
template<> struct unpacket_traits<Packet4h> {
  typedef Eigen::half type;
  enum {
    size      = 4,
    alignment = Aligned16
  };
  typedef Packet4h half;
};
// Broadcasts the raw 16-bit pattern of `from` into all four lanes.
template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
  Packet4h broadcast;
  broadcast.x = _mm_set1_pi16(from.x);
  return broadcast;
}
// Extracts the first element: the low 32 bits of the register are read and
// truncated to 16 bits, then reinterpreted as a half.
template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
  const unsigned short bits = static_cast<unsigned short>(_mm_cvtsi64_si32(from.x));
  return half_impl::raw_uint16_to_half(bits);
}
// Conjugate of a packet of halves: the packet is handed back unchanged.
template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) {
  return a;
}
// Lane-wise sum of two Packet4h. Both registers are moved to 64-bit integers;
// each 16-bit lane is extracted by shifting, converted to Eigen::half, and
// added with the scalar operator+. The four sums are then repacked.
template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
  const __int64_t a64 = _mm_cvtm64_si64(a.x);
  const __int64_t b64 = _mm_cvtm64_si64(b.x);

  Eigen::half h[4];
  for (int lane = 0; lane < 4; ++lane) {
    const int shift = 16 * lane;
    const Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> shift));
    const Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> shift));
    h[lane] = ha + hb;
  }

  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}
// Lane-wise product of two Packet4h. Both registers are moved to 64-bit
// integers; each 16-bit lane is extracted by shifting, converted to
// Eigen::half, and multiplied with the scalar operator*. The four products
// are then repacked.
template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
  const __int64_t a64 = _mm_cvtm64_si64(a.x);
  const __int64_t b64 = _mm_cvtm64_si64(b.x);

  Eigen::half h[4];
  for (int lane = 0; lane < 4; ++lane) {
    const int shift = 16 * lane;
    const Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> shift));
    const Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> shift));
    h[lane] = ha * hb;
  }

  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}
// Loads four halves by reading the 8 bytes at `from` as one 64-bit integer.
template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
  const __int64_t bits = *reinterpret_cast<const __int64_t*>(from);
  Packet4h loaded;
  loaded.x = _mm_cvtsi64_m64(bits);
  return loaded;
}
// Same implementation as pload<Packet4h>: reads the 8 bytes at `from` as one
// 64-bit integer.
// NOTE(review): this dereferences `from` as an __int64_t without any
// alignment handling - confirm this is acceptable for unaligned callers.
template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
  const __int64_t bits = *reinterpret_cast<const __int64_t*>(from);
  Packet4h loaded;
  loaded.x = _mm_cvtsi64_m64(bits);
  return loaded;
}
// Stores four halves by writing the register as one 64-bit integer at `to`.
template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  const __int64_t bits = _mm_cvtm64_si64(from.x);
  __int64_t* const dst = reinterpret_cast<__int64_t*>(to);
  *dst = bits;
}
// Same implementation as pstore: writes the register as one 64-bit integer
// at `to`.
// NOTE(review): the store goes through an __int64_t* without any alignment
// handling - confirm this is acceptable for unaligned callers.
template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  const __int64_t bits = _mm_cvtm64_si64(from.x);
  __int64_t* const dst = reinterpret_cast<__int64_t*>(to);
  *dst = bits;
}
// Fills all four lanes with the single half stored at `from`.
template<> EIGEN_STRONG_INLINE Packet4h
ploadquad<Packet4h>(const Eigen::half* from) {
  const Eigen::half& first = *from;
  return pset1<Packet4h>(first);
}
// Builds a packet from four halves read at from[0], from[stride],
// from[2*stride] and from[3*stride]. The elements are passed to _mm_set_pi16
// from index 3 down to index 0.
template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
{
  Packet4h gathered;
  gathered.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x,
                            from[1*stride].x, from[0*stride].x);
  return gathered;
}
// Writes the four 16-bit lanes of `from` to to[0], to[stride], to[2*stride]
// and to[3*stride]; lane k is taken from bits [16k, 16k+15] of the register.
template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
{
  const __int64_t bits = _mm_cvtm64_si64(from.x);
  for (int lane = 0; lane < 4; ++lane) {
    to[stride*lane].x = static_cast<unsigned short>(bits >> (16 * lane));
  }
}
// In-place transpose of a 4x4 block of halves held in four Packet4h rows:
// 16-bit interleaves of row pairs followed by 32-bit interleaves.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet4h,4>& kernel) {
  const __m64 lo01 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
  const __m64 lo23 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
  const __m64 hi01 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
  const __m64 hi23 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);

  kernel.packet[0].x = _mm_unpacklo_pi32(lo01, lo23);
  kernel.packet[1].x = _mm_unpackhi_pi32(lo01, lo23);
  kernel.packet[2].x = _mm_unpacklo_pi32(hi01, hi23);
  kernel.packet[3].x = _mm_unpackhi_pi32(hi01, hi23);
}
#endif
}
}
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/CUDA/TypeCasting.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_CUDA_H
#define EIGEN_TYPE_CASTING_CUDA_H
namespace
Eigen
{
namespace
internal
{
// Scalar cast functor float -> Eigen::half. Device code built with CUDA fp16
// support (compute capability >= 3.0) uses __float2half; every other build
// uses the Eigen::half(float) constructor.
template<>
struct scalar_cast_op<float, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    return __float2half(a);
#else
    return Eigen::half(a);
#endif
  }
};
// Cost model for the float -> half cast: one float addition, no packet access.
template<>
struct functor_traits<scalar_cast_op<float, Eigen::half> > {
  enum {
    Cost         = NumTraits<float>::AddCost,
    PacketAccess = false
  };
};
// Scalar cast functor int -> Eigen::half. The integer is first converted to
// float; the float -> half step uses __float2half in CUDA fp16 device code
// (compute capability >= 3.0) and the Eigen::half(float) constructor otherwise.
template<>
struct scalar_cast_op<int, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    return __float2half(static_cast<float>(a));
#else
    return Eigen::half(static_cast<float>(a));
#endif
  }
};
// Cost model for the int -> half cast: one float addition, no packet access.
template<>
struct functor_traits<scalar_cast_op<int, Eigen::half> > {
  enum {
    Cost         = NumTraits<float>::AddCost,
    PacketAccess = false
  };
};
// Scalar cast functor Eigen::half -> float. Device code built with CUDA fp16
// support (compute capability >= 3.0) uses __half2float; every other build
// uses static_cast<float>.
template<>
struct scalar_cast_op<Eigen::half, float> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef float result_type;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    return __half2float(a);
#else
    return static_cast<float>(a);
#endif
  }
};
// Cost model for the half -> float cast: one float addition, no packet access.
template<>
struct functor_traits<scalar_cast_op<Eigen::half, float> > {
  enum {
    Cost         = NumTraits<float>::AddCost,
    PacketAccess = false
  };
};
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
// CUDA path, half -> float: vectorized cast that consumes two source packets
// per target packet (see pcast<half2, float4>).
template <>
struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 2,
    TgtCoeffRatio  = 1
  };
};
// Converts two half2 packets into one float4: `a` supplies .x/.y of the
// result's first two components, `b` the last two.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
  const float2 lo = __half22float2(a);
  const float2 hi = __half22float2(b);
  return make_float4(lo.x, lo.y, hi.x, hi.y);
}
// CUDA path, float -> half: vectorized cast with a 1:2 source/target
// coefficient ratio.
template <>
struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 1,
    TgtCoeffRatio  = 2
  };
};
// Converts a float4 into a half2 using only its .x and .y components.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
  // Simply discard the second half of the input (a.z and a.w are not read).
  return __floats2half2_rn(a.x, a.y);
}
#elif defined EIGEN_VECTORIZE_AVX512
// AVX512 path, half -> float: vectorized cast, one source packet per target
// packet.
template <>
struct type_casting_traits<half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 1,
    TgtCoeffRatio  = 1
  };
};
// Packet cast Packet16h -> Packet16f; forwards to half2float.
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
  return half2float(a);
}
// AVX512 path, float -> half: vectorized cast, one source packet per target
// packet.
template <>
struct type_casting_traits<float, half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 1,
    TgtCoeffRatio  = 1
  };
};
// Packet cast Packet16f -> Packet16h; forwards to float2half.
template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
  return float2half(a);
}
#elif defined EIGEN_VECTORIZE_AVX
// AVX path, half -> float: vectorized cast, one source packet per target
// packet.
template <>
struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 1,
    TgtCoeffRatio  = 1
  };
};
// Packet cast Packet8h -> Packet8f; forwards to half2float.
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
  return half2float(a);
}
// AVX path, float -> half: vectorized cast, one source packet per target
// packet.
template <>
struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 1,
    TgtCoeffRatio  = 1
  };
};
// Packet cast Packet8f -> Packet8h; forwards to float2half.
template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
  return float2half(a);
}
// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#elif 0
// Disabled SSE (Packet4h) path, half -> float: vectorized cast, one source
// packet per target packet.
template <>
struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 1,
    TgtCoeffRatio  = 1
  };
};
// Packet cast Packet4h -> Packet4f. The register is moved to a 64-bit
// integer; each 16-bit lane is extracted by shifting, reinterpreted as an
// Eigen::half and converted to float. _mm_set_ps takes the lanes from index 3
// down to index 0.
//
// raw_uint16_to_half is spelled with its half_impl:: qualifier, matching the
// Packet4h packet-math specializations: its only argument is an unsigned
// short, so argument-dependent lookup cannot find it from this namespace.
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
  const __int64_t a64 = _mm_cvtm64_si64(a.x);

  float lanes[4];
  for (int lane = 0; lane < 4; ++lane) {
    const Eigen::half h =
        half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> (16 * lane)));
    lanes[lane] = static_cast<float>(h);
  }

  return _mm_set_ps(lanes[3], lanes[2], lanes[1], lanes[0]);
}
// Disabled SSE (Packet4h) path, float -> half: vectorized cast, one source
// packet per target packet.
template <>
struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio  = 1,
    TgtCoeffRatio  = 1
  };
};
// Packet cast Packet4f -> Packet4h. The floats are stored to an aligned
// temporary, each one is converted with the Eigen::half(float) constructor,
// and the four raw 16-bit patterns are repacked (index 3 first, as
// _mm_set_pi16 expects).
template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
  EIGEN_ALIGN16 float lanes[4];
  pstore(lanes, a);

  const Eigen::half h0(lanes[0]);
  const Eigen::half h1(lanes[1]);
  const Eigen::half h2(lanes[2]);
  const Eigen::half h3(lanes[3]);

  Packet4h packed;
  packed.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
  return packed;
}
#endif
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_TYPE_CASTING_CUDA_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/Default/ConjHelper.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_ARCH_CONJ_HELPER_H
// Generates the two mixed real/complex conj_helper specializations (both
// conjugation flags false) for a complex packet type PACKET_CPLX and its
// underlying real packet type PACKET_REAL:
//   - conj_helper<PACKET_REAL, PACKET_CPLX, false, false>  (real * complex)
//   - conj_helper<PACKET_CPLX, PACKET_REAL, false, false>  (complex * real)
// In both, pmul multiplies the real packet with the complex packet's `.v`
// member using pmul<PACKET_REAL> and wraps the result in PACKET_CPLX, and
// pmadd(x, y, c) returns padd(c, pmul(x, y)).
#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
}; \
\
template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
};
#endif // EIGEN_ARCH_CONJ_HELPER_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/NEON/Complex.h
View file @
13b115ab
...
...
@@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
...
...
@@ -14,8 +15,21 @@ namespace Eigen {
namespace
internal
{
static
uint32x4_t
p4ui_CONJ_XOR
=
EIGEN_INIT_NEON_PACKET4
(
0x00000000
,
0x80000000
,
0x00000000
,
0x80000000
);
static
uint32x2_t
p2ui_CONJ_XOR
=
EIGEN_INIT_NEON_PACKET2
(
0x00000000
,
0x80000000
);
// Returns the four-lane mask {0, 0x80000000, 0, 0x80000000}, i.e. the sign
// bit set in lanes 1 and 3 only.
inline uint32x4_t p4ui_CONJ_XOR() {
  // See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG
  uint32x4_t mask = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
  return mask;
#else
  static const uint32_t mask_lanes[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
  return vld1q_u32( mask_lanes );
#endif
}
// Returns the two-lane mask {0, 0x80000000}, i.e. the sign bit set in lane 1
// only.
inline uint32x2_t p2ui_CONJ_XOR() {
  static const uint32_t mask_lanes[] = { 0x00000000, 0x80000000 };
  return vld1_u32( mask_lanes );
}
//---------- float ----------
struct
Packet2cf
...
...
@@ -28,10 +42,12 @@ struct Packet2cf
template
<
>
struct
packet_traits
<
std
::
complex
<
float
>
>
:
default_packet_traits
{
typedef
Packet2cf
type
;
typedef
Packet2cf
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
2
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
...
...
@@ -46,12 +62,12 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
}
;
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
,
alignment
=
Aligned16
};
typedef
Packet2cf
half
;
};
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pset1
<
Packet2cf
>
(
const
std
::
complex
<
float
>&
from
)
{
float32x2_t
r64
;
r64
=
vld1_f32
((
float
*
)
&
from
);
r64
=
vld1_f32
((
const
float
*
)
&
from
);
return
Packet2cf
(
vcombine_f32
(
r64
,
r64
));
}
...
...
@@ -62,35 +78,32 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pconj
(
const
Packet2cf
&
a
)
{
Packet4ui
b
=
vreinterpretq_u32_f32
(
a
.
v
);
return
Packet2cf
(
vreinterpretq_f32_u32
(
veorq_u32
(
b
,
p4ui_CONJ_XOR
)));
return
Packet2cf
(
vreinterpretq_f32_u32
(
veorq_u32
(
b
,
p4ui_CONJ_XOR
()
)));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pmul
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
Packet4f
v1
,
v2
;
float32x2_t
a_lo
,
a_hi
;
// Get the real values of a | a1_re | a1_re | a2_re | a2_re |
v1
=
vcombine_f32
(
vdup_lane_f32
(
vget_low_f32
(
a
.
v
),
0
),
vdup_lane_f32
(
vget_high_f32
(
a
.
v
),
0
));
// Get the
real
values of a | a1_im | a1_im | a2_im | a2_im |
// Get the
imag
values of a | a1_im | a1_im | a2_im | a2_im |
v2
=
vcombine_f32
(
vdup_lane_f32
(
vget_low_f32
(
a
.
v
),
1
),
vdup_lane_f32
(
vget_high_f32
(
a
.
v
),
1
));
// Multiply the real a with b
v1
=
vmulq_f32
(
v1
,
b
.
v
);
// Multiply the imag a with b
v2
=
vmulq_f32
(
v2
,
b
.
v
);
// Conjugate v2
v2
=
vreinterpretq_f32_u32
(
veorq_u32
(
vreinterpretq_u32_f32
(
v2
),
p4ui_CONJ_XOR
));
v2
=
vreinterpretq_f32_u32
(
veorq_u32
(
vreinterpretq_u32_f32
(
v2
),
p4ui_CONJ_XOR
()
));
// Swap real/imag elements in v2.
a_lo
=
vrev64_f32
(
vget_low_f32
(
v2
));
a_hi
=
vrev64_f32
(
vget_high_f32
(
v2
));
v2
=
vcombine_f32
(
a_lo
,
a_hi
);
v2
=
vrev64q_f32
(
v2
);
// Add and return the result
return
Packet2cf
(
vaddq_f32
(
v1
,
v2
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pand
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
return
Packet2cf
(
vreinterpretq_f32_u32
(
v
orr
q_u32
(
vreinterpretq_u32_f32
(
a
.
v
),
vreinterpretq_u32_f32
(
b
.
v
))));
return
Packet2cf
(
vreinterpretq_f32_u32
(
v
and
q_u32
(
vreinterpretq_u32_f32
(
a
.
v
),
vreinterpretq_u32_f32
(
b
.
v
))));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
por
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
...
...
@@ -113,7 +126,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
pstore
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
std
::
complex
<
float
>
>
(
std
::
complex
<
float
>
*
to
,
const
Packet2cf
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
pstoreu
((
float
*
)
to
,
from
.
v
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
std
::
complex
<
float
>
>
(
const
std
::
complex
<
float
>
*
addr
)
{
__pld
((
float
*
)
addr
);
}
// Gathers two complex<float> values, from[0] and from[stride], into one
// packet laid out as (re0, im0, re1, im1).
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
  Packet4f lanes = pset1<Packet4f>(0.f);
  lanes = vsetq_lane_f32(std::real(from[0*stride]), lanes, 0);
  lanes = vsetq_lane_f32(std::imag(from[0*stride]), lanes, 1);
  lanes = vsetq_lane_f32(std::real(from[1*stride]), lanes, 2);
  lanes = vsetq_lane_f32(std::imag(from[1*stride]), lanes, 3);
  return Packet2cf(lanes);
}
// Scatters the two complex<float> values of `from` to to[0] and to[stride];
// lanes (0,1) form the first value and lanes (2,3) the second.
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
  const std::complex<float> first(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
  const std::complex<float> second(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
  to[stride*0] = first;
  to[stride*1] = second;
}
// Issues EIGEN_ARM_PREFETCH on the address of a complex<float>.
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) {
  EIGEN_ARM_PREFETCH((const float *)addr);
}
template
<
>
EIGEN_STRONG_INLINE
std
::
complex
<
float
>
pfirst
<
Packet2cf
>
(
const
Packet2cf
&
a
)
{
...
...
@@ -180,7 +209,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
// Multiply the imag a with b
v2
=
vmul_f32
(
v2
,
a2
);
// Conjugate v2
v2
=
vreinterpret_f32_u32
(
veor_u32
(
vreinterpret_u32_f32
(
v2
),
p2ui_CONJ_XOR
));
v2
=
vreinterpret_f32_u32
(
veor_u32
(
vreinterpret_u32_f32
(
v2
),
p2ui_CONJ_XOR
()
));
// Swap real/imag elements in v2.
v2
=
vrev64_f32
(
v2
);
// Add v1, v2
...
...
@@ -236,21 +265,223 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL
(
Packet2cf
,
Packet4f
)
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pdiv
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
// TODO optimize it for
AltiVec
// TODO optimize it for
NEON
Packet2cf
res
=
conj_helper
<
Packet2cf
,
Packet2cf
,
false
,
true
>
().
pmul
(
a
,
b
);
Packet4f
s
,
rev_s
;
float32x2_t
a_lo
,
a_hi
;
// this computes the norm
s
=
vmulq_f32
(
b
.
v
,
b
.
v
);
a_lo
=
vrev64_f32
(
vget_low_f32
(
s
));
a_hi
=
vrev64_f32
(
vget_high_f32
(
s
));
rev_s
=
vcombine_f32
(
a_lo
,
a_hi
);
rev_s
=
vrev64q_f32
(
s
);
return
Packet2cf
(
pdiv
<
Packet4f
>
(
res
.
v
,
vaddq_f32
(
s
,
rev_s
)));
}
// In-place transpose of a 2x2 block of complex<float>: the high halves of the
// two packets become packet 1, the low halves become packet 0.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf,2>& kernel) {
  const Packet4f high_halves = vcombine_f32(vget_high_f32(kernel.packet[0].v),
                                            vget_high_f32(kernel.packet[1].v));
  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v),
                                    vget_low_f32(kernel.packet[1].v));
  kernel.packet[1].v = high_halves;
}
//---------- double ----------
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
// Two-lane 64-bit mask {0, 0x8000000000000000}: the sign bit set in lane 1
// only.
// See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG
  static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
#else
  const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
  static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
#endif
// Packet holding one complex<double>, stored in a Packet2d member `v`.
struct Packet1cd
{
  EIGEN_STRONG_INLINE Packet1cd() {}
  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}

  Packet2d v;
};
// Packet traits for std::complex<double>: one element per Packet1cd, with
// add/sub/mul/div/negate enabled and abs/abs2/min/max/set-linear disabled.
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    Vectorizable    = 1,
    AlignedOnScalar = 0,
    size            = 1,
    HasHalfPacket   = 0,

    HasAdd       = 1,
    HasSub       = 1,
    HasMul       = 1,
    HasDiv       = 1,
    HasNegate    = 1,
    HasAbs       = 0,
    HasAbs2      = 0,
    HasMin       = 0,
    HasMax       = 0,
    HasSetLinear = 0
  };
};
// Reverse mapping for Packet1cd: scalar type std::complex<double>, one
// element, alignment tag Aligned16, and itself as the "half" packet.
template<> struct unpacket_traits<Packet1cd> {
  typedef std::complex<double> type;
  enum {
    size      = 1,
    alignment = Aligned16
  };
  typedef Packet1cd half;
};
// Aligned load of one complex<double>, delegated to pload<Packet2d> on the
// underlying doubles.
template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  return Packet1cd(pload<Packet2d>((const double*)from));
}
// Unaligned load of one complex<double>, delegated to ploadu<Packet2d> on the
// underlying doubles.
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return Packet1cd(ploadu<Packet2d>((const double*)from));
}
// Builds a packet from a single complex<double> value.
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{
  /* here we really have to use unaligned loads :( */
  const std::complex<double>* const src = &from;
  return ploadu<Packet1cd>(src);
}
// Complex addition: component-wise padd<Packet2d> on the underlying doubles.
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  return Packet1cd(padd<Packet2d>(a.v, b.v));
}
// Complex subtraction: component-wise psub<Packet2d> on the underlying doubles.
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  return Packet1cd(psub<Packet2d>(a.v, b.v));
}
// Complex negation: pnegate<Packet2d> applied to both components.
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
  return Packet1cd(pnegate<Packet2d>(a.v));
}
// Complex conjugate: XORs the packet's bits with p2ul_CONJ_XOR.
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
  const uint64x2_t bits = vreinterpretq_u64_f64(a.v);
  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(bits, p2ul_CONJ_XOR)));
}
// Complex product a*b. With a = (ar, ai) and b = (br, bi):
//   re_part = (ar*br, ar*bi)
//   im_part = (ai*br, ai*bi) -> XOR with p2ul_CONJ_XOR -> lanes swapped
// and the two parts are added.
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // Broadcast the real and the imaginary component of a.
  const Packet2d a_real = vdupq_lane_f64(vget_low_f64(a.v), 0);
  const Packet2d a_imag = vdupq_lane_f64(vget_high_f64(a.v), 0);

  // Scale b by each of them.
  const Packet2d re_part = vmulq_f64(a_real, b.v);
  Packet2d im_part = vmulq_f64(a_imag, b.v);

  // Conjugate the imaginary contribution, then swap its real/imag lanes.
  im_part = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(im_part), p2ul_CONJ_XOR));
  im_part = preverse<Packet2d>(im_part);

  return Packet1cd(vaddq_f64(re_part, im_part));
}
return
Packet2cf
(
pdiv
(
res
.
v
,
vaddq_f32
(
s
,
rev_s
)));
// Bitwise AND of the two packets' raw 64-bit lanes.
template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const uint64x2_t lhs = vreinterpretq_u64_f64(a.v);
  const uint64x2_t rhs = vreinterpretq_u64_f64(b.v);
  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(lhs, rhs)));
}
// Bitwise OR of the two packets' raw 64-bit lanes.
template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const uint64x2_t lhs = vreinterpretq_u64_f64(a.v);
  const uint64x2_t rhs = vreinterpretq_u64_f64(b.v);
  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(lhs, rhs)));
}
// Bitwise XOR of the two packets' raw 64-bit lanes.
template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const uint64x2_t lhs = vreinterpretq_u64_f64(a.v);
  const uint64x2_t rhs = vreinterpretq_u64_f64(b.v);
  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(lhs, rhs)));
}
// Applies vbicq_u64 (bit clear) to the raw 64-bit lanes of a and b, in that
// argument order.
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  const uint64x2_t lhs = vreinterpretq_u64_f64(a.v);
  const uint64x2_t rhs = vreinterpretq_u64_f64(b.v);
  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(lhs, rhs)));
}
// A Packet1cd holds a single value, so this simply loads *from via pset1.
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
  const std::complex<double>& value = *from;
  return pset1<Packet1cd>(value);
}
// Aligned store of one complex<double>, delegated to the double pstore.
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) {
  EIGEN_DEBUG_ALIGNED_STORE
  pstore((double*)to, from.v);
}
// Unaligned store of one complex<double>, delegated to the double pstoreu.
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  pstoreu((double*)to, from.v);
}
// Issues EIGEN_ARM_PREFETCH on the address of a complex<double>.
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) {
  EIGEN_ARM_PREFETCH((const double *)addr);
}
// Gathers a single complex<double> from from[0*stride] into lanes (re, im).
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
  Packet2d lanes = pset1<Packet2d>(0.0);
  lanes = vsetq_lane_f64(std::real(from[0*stride]), lanes, 0);
  lanes = vsetq_lane_f64(std::imag(from[0*stride]), lanes, 1);
  return Packet1cd(lanes);
}
// Writes the packet's single complex<double> (lane 0 = real, lane 1 = imag)
// to to[stride*0].
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
{
  const std::complex<double> value(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
  to[stride*0] = value;
}
// Returns the packet's single element by storing it to a 16-byte aligned
// temporary.
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
  std::complex<double> EIGEN_ALIGN16 first;
  pstore<std::complex<double> >(&first, a);
  return first;
}
// Reversing a one-element packet: the input is returned unchanged.
template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
  return a;
}
// Sum reduction of a one-element packet: returns that element via pfirst.
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
  return pfirst(a);
}
// Packet-wise reduction for one-element packets: returns vecs[0].
template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
  return vecs[0];
}
// Product reduction of a one-element packet: returns that element via pfirst.
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
  return pfirst(a);
}
// palign for Packet1cd: run() is intentionally empty and leaves both
// arguments untouched.
template<int Offset>
struct palign_impl<Offset,Packet1cd>
{
  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
  {
    // FIXME is it sure we never have to align a Packet1cd?
    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
  }
};
// conj_helper<false,true>: multiplies a by the conjugate of b.
// pmadd(x, y, c) returns pmul(x, y) + c.
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  {
    return padd(pmul(x,y),c);
  }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return internal::pmul(a, pconj(b));
  }
};
// conj_helper<true,false>: multiplies the conjugate of a by b.
// pmadd(x, y, c) returns pmul(x, y) + c.
template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  {
    return padd(pmul(x,y),c);
  }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return internal::pmul(pconj(a), b);
  }
};
// conj_helper<true,true>: conjugates the product a*b.
// pmadd(x, y, c) returns pmul(x, y) + c.
template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
{
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
  {
    return padd(pmul(x,y),c);
  }

  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
  {
    return pconj(internal::pmul(a, b));
  }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL
(
Packet1cd
,
Packet2d
)
// Complex division a / b, computed as (a * conj(b)) divided component-wise
// by (b.re^2 + b.im^2).
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  // TODO optimize it for NEON
  const Packet1cd numerator = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
  // (re^2, im^2) and its lane-reversed copy; their sum holds the squared
  // magnitude of b in both lanes.
  const Packet2d squares = pmul<Packet2d>(b.v, b.v);
  const Packet2d squares_rev = preverse<Packet2d>(squares);

  return Packet1cd(pdiv(numerator.v, padd<Packet2d>(squares, squares_rev)));
}
// Returns a Packet1cd whose two double lanes are those of x in reversed order.
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
  const Packet2d lanes(x.v);
  return Packet1cd(preverse(lanes));
}
// In-place transpose of a 2-packet block of Packet1cd, done on the underlying
// double lanes: packet[0] receives the low lanes of both inputs and packet[1]
// receives the high lanes.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
{
  Packet1cd& row0 = kernel.packet[0];
  Packet1cd& row1 = kernel.packet[1];

  // Both results are computed before either input is overwritten.
  const Packet2d lows  = vcombine_f64(vget_low_f64(row0.v),  vget_low_f64(row1.v));
  const Packet2d highs = vcombine_f64(vget_high_f64(row0.v), vget_high_f64(row1.v));

  row0.v = lows;
  row1.v = highs;
}
#endif // EIGEN_ARCH_ARM64
}
// end namespace internal
...
...
pydensecrf/densecrf/include/Eigen/src/Core/arch/NEON/MathFunctions.h
0 → 100644
View file @
13b115ab
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
#define EIGEN_MATH_FUNCTIONS_NEON_H
namespace
Eigen
{
namespace
internal
{
// Vectorised single-precision exp() for NEON.
// The input is clamped to [exp_lo, exp_hi], split as x = g + n*log(2), exp(g)
// is evaluated with a degree-5 polynomial and the result is scaled by 2^n,
// which is built directly in the exponent field of an IEEE float.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
  // Each macro declares a broadcast constant named p4f_<NAME> / p4i_<NAME>.
  _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
  _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);

  // Clamp the argument to the supported range.
  Packet4f x = vminq_f32(_x, p4f_exp_hi);
  x = vmaxq_f32(x, p4f_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  Packet4f fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);

  /* perform a floorf: convert to int and back, then ... */
  const Packet4f truncated = vcvtq_f32_s32(vcvtq_s32_f32(fx));

  /* ... if the converted value is greater, subtract 1 */
  Packet4ui overshoot = vcgtq_f32(truncated, fx);
  overshoot = vandq_u32(overshoot, vreinterpretq_u32_f32(p4f_1));
  fx = vsubq_f32(truncated, vreinterpretq_f32_u32(overshoot));

  // Remove n*log(2) from x in two steps (constants C1 and C2).
  const Packet4f reduce_c1 = vmulq_f32(fx, p4f_cephes_exp_C1);
  const Packet4f reduce_c2 = vmulq_f32(fx, p4f_cephes_exp_C2);
  x = vsubq_f32(x, reduce_c1);
  x = vsubq_f32(x, reduce_c2);

  // Polynomial in Horner form: (((((p0*x + p1)*x + p2)*x + p3)*x + p4)*x + p5) * x^2 + x + 1
  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
  const Packet4f x_squared = vmulq_f32(x, x);
  y = vaddq_f32(y, p4f_cephes_exp_p1);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p2);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p3);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p4);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_exp_p5);
  y = vmulq_f32(y, x_squared);
  y = vaddq_f32(y, x);
  y = vaddq_f32(y, p4f_1);

  /* build 2^n */
  int32x4_t exponent_bits = vcvtq_s32_f32(fx);
  exponent_bits = vaddq_s32(exponent_bits, p4i_0x7f);  // add the exponent bias
  exponent_bits = vshlq_n_s32(exponent_bits, 23);      // move into the exponent field
  const Packet4f pow2n = vreinterpretq_f32_s32(exponent_bits);

  return vmulq_f32(y, pow2n);
}
}
// end namespace internal
}
// end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_NEON_H
pydensecrf/densecrf/include/Eigen/src/Core/arch/NEON/PacketMath.h
View file @
13b115ab
...
...
@@ -2,7 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@
codex.gr
>
// Copyright (C) 2010 Konstantinos Margaritis <markos@
freevec.org
>
// Heavily based on Gael's SSE version.
//
// This Source Code Form is subject to the terms of the Mozilla
...
...
@@ -20,89 +20,141 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
#if EIGEN_COMP_MSVC
// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
// Thin wrapper that turns an underlying vector type T into a distinct C++ type.
// unique_id is not used in the body; it only makes wrappers of the same T
// different types. The wrapper converts implicitly to (const) T& and can be
// constructed or assigned from a T.
template<typename T, int unique_id>
struct eigen_packet_wrapper
{
  // Default construction leaves m_val uninitialised.
  eigen_packet_wrapper() {}

  eigen_packet_wrapper(const T& value)
    : m_val(value)
  {}

  eigen_packet_wrapper& operator=(const T& value)
  {
    m_val = value;
    return *this;
  }

  // Implicit access to the wrapped value.
  operator T&()
  {
    return m_val;
  }

  operator const T&() const
  {
    return m_val;
  }

  T m_val;
};
typedef
eigen_packet_wrapper
<
float32x2_t
,
0
>
Packet2f
;
typedef
eigen_packet_wrapper
<
float32x4_t
,
1
>
Packet4f
;
typedef
eigen_packet_wrapper
<
int32x4_t
,
2
>
Packet4i
;
typedef
eigen_packet_wrapper
<
int32x2_t
,
3
>
Packet2i
;
typedef
eigen_packet_wrapper
<
uint32x4_t
,
4
>
Packet4ui
;
#else
typedef
float32x2_t
Packet2f
;
typedef
float32x4_t
Packet4f
;
typedef
int32x4_t
Packet4i
;
typedef
int32x2_t
Packet2i
;
typedef
uint32x4_t
Packet4ui
;
#endif // EIGEN_COMP_MSVC
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int
32_t
>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
#if defined(__llvm__) && !defined(__clang__)
//Special treatment for Apple's llvm-gcc, its NEON packet types are unions
#define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}}
#define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
#if EIGEN_ARCH_ARM64
// __builtin_prefetch tends to do nothing on ARM64 compilers because the
// prefetch instructions there are too detailed for __builtin_prefetch to map
// meaningfully to them.
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif EIGEN_ARCH_ARM32
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
//Default initializer for packets
#define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y}
#define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
#endif
#ifndef __pld
#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
// by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR)
#endif
template
<
>
struct
packet_traits
<
float
>
:
default_packet_traits
{
typedef
Packet4f
type
;
typedef
Packet4f
half
;
// Packet2f intrinsics not implemented yet
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
4
,
HasHalfPacket
=
0
,
// Packet2f intrinsics not implemented yet
HasDiv
=
1
,
// FIXME check the Has*
HasSin
=
0
,
HasCos
=
0
,
HasLog
=
0
,
HasExp
=
0
,
HasExp
=
1
,
HasSqrt
=
0
};
};
template
<
>
struct
packet_traits
<
int
>
:
default_packet_traits
template
<
>
struct
packet_traits
<
int
32_t
>
:
default_packet_traits
{
typedef
Packet4i
type
;
typedef
Packet4i
half
;
// Packet2i intrinsics not implemented yet
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
4
size
=
4
,
HasHalfPacket
=
0
// Packet2i intrinsics not implemented yet
// FIXME check the Has*
};
};
#if EIGEN_GNUC_AT_MOST(4,4) && !
defined(__llvm__)
#if EIGEN_GNUC_AT_MOST(4,4) && !
EIGEN_COMP_LLVM
// workaround gcc 4.2, 4.3 and 4.4 compilation issue
EIGEN_STRONG_INLINE
float32x4_t
vld1q_f32
(
const
float
*
x
)
{
return
::
vld1q_f32
((
const
float32_t
*
)
x
);
}
EIGEN_STRONG_INLINE
float32x2_t
vld1_f32
(
const
float
*
x
)
{
return
::
vld1_f32
((
const
float32_t
*
)
x
);
}
EIGEN_STRONG_INLINE
float32x2_t
vld1_dup_f32
(
const
float
*
x
)
{
return
::
vld1_dup_f32
((
const
float32_t
*
)
x
);
}
EIGEN_STRONG_INLINE
void
vst1q_f32
(
float
*
to
,
float32x4_t
from
)
{
::
vst1q_f32
((
float32_t
*
)
to
,
from
);
}
EIGEN_STRONG_INLINE
void
vst1_f32
(
float
*
to
,
float32x2_t
from
)
{
::
vst1_f32
((
float32_t
*
)
to
,
from
);
}
#endif
template
<
>
struct
unpacket_traits
<
Packet4f
>
{
typedef
float
type
;
enum
{
size
=
4
}
;
};
template
<
>
struct
unpacket_traits
<
Packet4i
>
{
typedef
int
type
;
enum
{
size
=
4
}
;
};
template
<
>
struct
unpacket_traits
<
Packet4f
>
{
typedef
float
type
;
enum
{
size
=
4
,
alignment
=
Aligned16
};
typedef
Packet4f
half
;
};
template
<
>
struct
unpacket_traits
<
Packet4i
>
{
typedef
int
32_t
type
;
enum
{
size
=
4
,
alignment
=
Aligned16
};
typedef
Packet4i
half
;
};
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pset1
<
Packet4f
>
(
const
float
&
from
)
{
return
vdupq_n_f32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pset1
<
Packet4i
>
(
const
int
&
from
)
{
return
vdupq_n_s32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pset1
<
Packet4i
>
(
const
int
32_t
&
from
)
{
return
vdupq_n_s32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
plset
<
float
>
(
const
float
&
a
)
template
<
>
EIGEN_STRONG_INLINE
Packet4f
plset
<
Packet4f
>
(
const
float
&
a
)
{
Packet4f
countdown
=
EIGEN_INIT_NEON_PACKET4
(
0
,
1
,
2
,
3
);
const
float
f
[]
=
{
0
,
1
,
2
,
3
};
Packet4f
countdown
=
vld1q_f32
(
f
);
return
vaddq_f32
(
pset1
<
Packet4f
>
(
a
),
countdown
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
plset
<
int
>
(
const
int
&
a
)
template
<
>
EIGEN_STRONG_INLINE
Packet4i
plset
<
Packet4i
>
(
const
int
32_t
&
a
)
{
Packet4i
countdown
=
EIGEN_INIT_NEON_PACKET4
(
0
,
1
,
2
,
3
);
const
int32_t
i
[]
=
{
0
,
1
,
2
,
3
};
Packet4i
countdown
=
vld1q_s32
(
i
);
return
vaddq_s32
(
pset1
<
Packet4i
>
(
a
),
countdown
);
}
...
...
@@ -115,11 +167,17 @@ template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pnegate
(
const
Packet4f
&
a
)
{
return
vnegq_f32
(
a
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pnegate
(
const
Packet4i
&
a
)
{
return
vnegq_s32
(
a
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pconj
(
const
Packet4f
&
a
)
{
return
a
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pconj
(
const
Packet4i
&
a
)
{
return
a
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmul
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
vmulq_f32
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmul
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
vmulq_s32
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pdiv
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
#if EIGEN_ARCH_ARM64
return
vdivq_f32
(
a
,
b
);
#else
Packet4f
inv
,
restep
,
div
;
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
...
...
@@ -138,14 +196,51 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
div
=
vmulq_f32
(
a
,
inv
);
return
div
;
#endif
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pdiv
<
Packet4i
>
(
const
Packet4i
&
/*a*/
,
const
Packet4i
&
/*b*/
)
{
eigen_assert
(
false
&&
"packet integer division are not supported by NEON"
);
return
pset1
<
Packet4i
>
(
0
);
}
// for some weird raisons, it has to be overloaded for packet of integers
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmadd
(
const
Packet4f
&
a
,
const
Packet4f
&
b
,
const
Packet4f
&
c
)
{
return
vmlaq_f32
(
c
,
a
,
b
);
}
// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
// then implements a slow software scalar fallback calling fmaf()!
// Filed LLVM bug:
// https://llvm.org/bugs/show_bug.cgi?id=27216
#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
// See bug 936.
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
// MLA is not fused i.e. does 2 roundings.
// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmadd
(
const
Packet4f
&
a
,
const
Packet4f
&
b
,
const
Packet4f
&
c
)
{
return
vfmaq_f32
(
c
,
a
,
b
);
}
#else
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmadd
(
const
Packet4f
&
a
,
const
Packet4f
&
b
,
const
Packet4f
&
c
)
{
#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
// Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
// at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
// -march=armv7-a, that is a very common case.
// See e.g. this thread:
// http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
// Filed LLVM bug:
// https://llvm.org/bugs/show_bug.cgi?id=27219
Packet4f
r
=
c
;
asm
volatile
(
"vmla.f32 %q[r], %q[a], %q[b]"
:
[
r
]
"+w"
(
r
)
:
[
a
]
"w"
(
a
),
[
b
]
"w"
(
b
)
:
);
return
r
;
#else
return
vmlaq_f32
(
c
,
a
,
b
);
#endif
}
#endif
// No FMA instruction for int, so use MLA unconditionally.
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pmadd
(
const
Packet4i
&
a
,
const
Packet4i
&
b
,
const
Packet4i
&
c
)
{
return
vmlaq_s32
(
c
,
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pmin
<
Packet4f
>
(
const
Packet4f
&
a
,
const
Packet4f
&
b
)
{
return
vminq_f32
(
a
,
b
);
}
...
...
@@ -180,38 +275,72 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pandnot
<
Packet4i
>
(
const
Packet4i
&
a
,
const
Packet4i
&
b
)
{
return
vbicq_s32
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pload
<
Packet4f
>
(
const
float
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
vld1q_f32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pload
<
Packet4i
>
(
const
int
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
vld1q_s32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pload
<
Packet4i
>
(
const
int
32_t
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
vld1q_s32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
ploadu
<
Packet4f
>
(
const
float
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
vld1q_f32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
ploadu
<
Packet4i
>
(
const
int
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
vld1q_s32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
ploadu
<
Packet4i
>
(
const
int
32_t
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
vld1q_s32
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
ploaddup
<
Packet4f
>
(
const
float
*
from
)
{
float32x2_t
lo
,
hi
;
lo
=
vdup_
n_
f32
(
*
from
);
hi
=
vdup_
n_
f32
(
*
(
from
+
1
)
)
;
lo
=
v
ld1_
dup_f32
(
from
);
hi
=
v
ld1_
dup_f32
(
from
+
1
);
return
vcombine_f32
(
lo
,
hi
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
ploaddup
<
Packet4i
>
(
const
int
*
from
)
template
<
>
EIGEN_STRONG_INLINE
Packet4i
ploaddup
<
Packet4i
>
(
const
int
32_t
*
from
)
{
int32x2_t
lo
,
hi
;
lo
=
vdup_
n_
s32
(
*
from
);
hi
=
vdup_
n_
s32
(
*
(
from
+
1
)
)
;
lo
=
v
ld1_
dup_s32
(
from
);
hi
=
v
ld1_
dup_s32
(
from
+
1
);
return
vcombine_s32
(
lo
,
hi
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
vst1q_f32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
int
>
(
int
*
to
,
const
Packet4i
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
vst1q_s32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
vst1q_f32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
int
32_t
>
(
int
32_t
*
to
,
const
Packet4i
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
vst1q_s32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
vst1q_f32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
int
>
(
int
*
to
,
const
Packet4i
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
vst1q_s32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
float
>
(
float
*
to
,
const
Packet4f
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
vst1q_f32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
int
32_t
>
(
int
32_t
*
to
,
const
Packet4i
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
vst1q_s32
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
float
>
(
const
float
*
addr
)
{
__pld
(
addr
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
int
>
(
const
int
*
addr
)
{
__pld
(
addr
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
Packet4f
pgather
<
float
,
Packet4f
>
(
const
float
*
from
,
Index
stride
)
{
Packet4f
res
=
pset1
<
Packet4f
>
(
0.
f
);
res
=
vsetq_lane_f32
(
from
[
0
*
stride
],
res
,
0
);
res
=
vsetq_lane_f32
(
from
[
1
*
stride
],
res
,
1
);
res
=
vsetq_lane_f32
(
from
[
2
*
stride
],
res
,
2
);
res
=
vsetq_lane_f32
(
from
[
3
*
stride
],
res
,
3
);
return
res
;
}
template
<
>
EIGEN_DEVICE_FUNC
inline
Packet4i
pgather
<
int32_t
,
Packet4i
>
(
const
int32_t
*
from
,
Index
stride
)
{
Packet4i
res
=
pset1
<
Packet4i
>
(
0
);
res
=
vsetq_lane_s32
(
from
[
0
*
stride
],
res
,
0
);
res
=
vsetq_lane_s32
(
from
[
1
*
stride
],
res
,
1
);
res
=
vsetq_lane_s32
(
from
[
2
*
stride
],
res
,
2
);
res
=
vsetq_lane_s32
(
from
[
3
*
stride
],
res
,
3
);
return
res
;
}
template
<
>
EIGEN_DEVICE_FUNC
inline
void
pscatter
<
float
,
Packet4f
>
(
float
*
to
,
const
Packet4f
&
from
,
Index
stride
)
{
to
[
stride
*
0
]
=
vgetq_lane_f32
(
from
,
0
);
to
[
stride
*
1
]
=
vgetq_lane_f32
(
from
,
1
);
to
[
stride
*
2
]
=
vgetq_lane_f32
(
from
,
2
);
to
[
stride
*
3
]
=
vgetq_lane_f32
(
from
,
3
);
}
template
<
>
EIGEN_DEVICE_FUNC
inline
void
pscatter
<
int32_t
,
Packet4i
>
(
int32_t
*
to
,
const
Packet4i
&
from
,
Index
stride
)
{
to
[
stride
*
0
]
=
vgetq_lane_s32
(
from
,
0
);
to
[
stride
*
1
]
=
vgetq_lane_s32
(
from
,
1
);
to
[
stride
*
2
]
=
vgetq_lane_s32
(
from
,
2
);
to
[
stride
*
3
]
=
vgetq_lane_s32
(
from
,
3
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
float
>
(
const
float
*
addr
)
{
EIGEN_ARM_PREFETCH
(
addr
);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
int32_t
>
(
const
int32_t
*
addr
)
{
EIGEN_ARM_PREFETCH
(
addr
);
}
// FIXME only store the 2 first elements ?
// Returns the first (lane 0) element of a Packet4f.
// The lane is read directly instead of storing all four lanes to an aligned
// stack array and reading element 0 back; the returned value is the same.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a)
{
  return vgetq_lane_f32(a, 0);
}
template
<
>
EIGEN_STRONG_INLINE
int
pfirst
<
Packet4i
>
(
const
Packet4i
&
a
)
{
int
EIGEN_ALIGN16
x
[
4
];
vst1q_s32
(
x
,
a
);
return
x
[
0
];
}
// Returns the first (lane 0) element of a Packet4i.
// The lane is read directly instead of storing all four lanes to an aligned
// stack array and reading element 0 back; the returned value is the same.
template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a)
{
  return vgetq_lane_s32(a, 0);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
preverse
(
const
Packet4f
&
a
)
{
float32x2_t
a_lo
,
a_hi
;
...
...
@@ -231,21 +360,19 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
a_hi
=
vget_high_s32
(
a_r64
);
return
vcombine_s32
(
a_hi
,
a_lo
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
pabs
(
const
Packet4f
&
a
)
{
return
vabsq_f32
(
a
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
pabs
(
const
Packet4i
&
a
)
{
return
vabsq_s32
(
a
);
}
template
<
>
EIGEN_STRONG_INLINE
float
predux
<
Packet4f
>
(
const
Packet4f
&
a
)
{
float32x2_t
a_lo
,
a_hi
,
sum
;
float
s
[
2
];
a_lo
=
vget_low_f32
(
a
);
a_hi
=
vget_high_f32
(
a
);
sum
=
vpadd_f32
(
a_lo
,
a_hi
);
sum
=
vpadd_f32
(
sum
,
sum
);
vst1_f32
(
s
,
sum
);
return
s
[
0
];
return
vget_lane_f32
(
sum
,
0
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4f
preduxp
<
Packet4f
>
(
const
Packet4f
*
vecs
)
...
...
@@ -268,18 +395,15 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
return
sum
;
}
template
<
>
EIGEN_STRONG_INLINE
int
predux
<
Packet4i
>
(
const
Packet4i
&
a
)
template
<
>
EIGEN_STRONG_INLINE
int
32_t
predux
<
Packet4i
>
(
const
Packet4i
&
a
)
{
int32x2_t
a_lo
,
a_hi
,
sum
;
int32_t
s
[
2
];
a_lo
=
vget_low_s32
(
a
);
a_hi
=
vget_high_s32
(
a
);
sum
=
vpadd_s32
(
a_lo
,
a_hi
);
sum
=
vpadd_s32
(
sum
,
sum
);
vst1_s32
(
s
,
sum
);
return
s
[
0
];
return
vget_lane_s32
(
sum
,
0
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet4i
preduxp
<
Packet4i
>
(
const
Packet4i
*
vecs
)
...
...
@@ -307,7 +431,6 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
template
<
>
EIGEN_STRONG_INLINE
float
predux_mul
<
Packet4f
>
(
const
Packet4f
&
a
)
{
float32x2_t
a_lo
,
a_hi
,
prod
;
float
s
[
2
];
// Get a_lo = |a1|a2| and a_hi = |a3|a4|
a_lo
=
vget_low_f32
(
a
);
...
...
@@ -316,14 +439,12 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
prod
=
vmul_f32
(
a_lo
,
a_hi
);
// Multiply prod with its swapped value |a2*a4|a1*a3|
prod
=
vmul_f32
(
prod
,
vrev64_f32
(
prod
));
vst1_f32
(
s
,
prod
);
return
s
[
0
]
;
return
vget_lane_f32
(
prod
,
0
)
;
}
template
<
>
EIGEN_STRONG_INLINE
int
predux_mul
<
Packet4i
>
(
const
Packet4i
&
a
)
template
<
>
EIGEN_STRONG_INLINE
int
32_t
predux_mul
<
Packet4i
>
(
const
Packet4i
&
a
)
{
int32x2_t
a_lo
,
a_hi
,
prod
;
int32_t
s
[
2
];
// Get a_lo = |a1|a2| and a_hi = |a3|a4|
a_lo
=
vget_low_s32
(
a
);
...
...
@@ -332,65 +453,58 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
prod
=
vmul_s32
(
a_lo
,
a_hi
);
// Multiply prod with its swapped value |a2*a4|a1*a3|
prod
=
vmul_s32
(
prod
,
vrev64_s32
(
prod
));
vst1_s32
(
s
,
prod
);
return
s
[
0
]
;
return
vget_lane_s32
(
prod
,
0
)
;
}
// min
template
<
>
EIGEN_STRONG_INLINE
float
predux_min
<
Packet4f
>
(
const
Packet4f
&
a
)
{
float32x2_t
a_lo
,
a_hi
,
min
;
float
s
[
2
];
a_lo
=
vget_low_f32
(
a
);
a_hi
=
vget_high_f32
(
a
);
min
=
vpmin_f32
(
a_lo
,
a_hi
);
min
=
vpmin_f32
(
min
,
min
);
vst1_f32
(
s
,
min
);
return
s
[
0
]
;
return
vget_lane_f32
(
min
,
0
)
;
}
template
<
>
EIGEN_STRONG_INLINE
int
predux_min
<
Packet4i
>
(
const
Packet4i
&
a
)
template
<
>
EIGEN_STRONG_INLINE
int32_t
predux_min
<
Packet4i
>
(
const
Packet4i
&
a
)
{
int32x2_t
a_lo
,
a_hi
,
min
;
int32_t
s
[
2
];
a_lo
=
vget_low_s32
(
a
);
a_hi
=
vget_high_s32
(
a
);
min
=
vpmin_s32
(
a_lo
,
a_hi
);
min
=
vpmin_s32
(
min
,
min
);
vst1_s32
(
s
,
min
);
return
s
[
0
]
;
return
vget_lane_s32
(
min
,
0
)
;
}
// max
template
<
>
EIGEN_STRONG_INLINE
float
predux_max
<
Packet4f
>
(
const
Packet4f
&
a
)
{
float32x2_t
a_lo
,
a_hi
,
max
;
float
s
[
2
];
a_lo
=
vget_low_f32
(
a
);
a_hi
=
vget_high_f32
(
a
);
max
=
vpmax_f32
(
a_lo
,
a_hi
);
max
=
vpmax_f32
(
max
,
max
);
vst1_f32
(
s
,
max
);
return
s
[
0
]
;
return
vget_lane_f32
(
max
,
0
)
;
}
template
<
>
EIGEN_STRONG_INLINE
int
predux_max
<
Packet4i
>
(
const
Packet4i
&
a
)
template
<
>
EIGEN_STRONG_INLINE
int32_t
predux_max
<
Packet4i
>
(
const
Packet4i
&
a
)
{
int32x2_t
a_lo
,
a_hi
,
max
;
int32_t
s
[
2
];
a_lo
=
vget_low_s32
(
a
);
a_hi
=
vget_high_s32
(
a
);
max
=
vpmax_s32
(
a_lo
,
a_hi
);
max
=
vpmax_s32
(
max
,
max
);
vst1_s32
(
s
,
max
);
return
s
[
0
]
;
return
vget_lane_s32
(
max
,
0
)
;
}
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
...
...
@@ -417,6 +531,228 @@ PALIGN_NEON(3,Packet4i,vextq_s32)
#undef PALIGN_NEON
// In-place transpose of a 4x4 block of floats held in four Packet4f.
// Rows 0/1 and rows 2/3 are zipped (interleaved) pairwise, then the low and
// high halves of the zipped vectors are recombined into the output rows.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel)
{
  // All inputs are read here, before any packet is overwritten.
  const float32x4x2_t zip01 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
  const float32x4x2_t zip23 = vzipq_f32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = vcombine_f32(vget_low_f32(zip01.val[0]),  vget_low_f32(zip23.val[0]));
  kernel.packet[1] = vcombine_f32(vget_high_f32(zip01.val[0]), vget_high_f32(zip23.val[0]));
  kernel.packet[2] = vcombine_f32(vget_low_f32(zip01.val[1]),  vget_low_f32(zip23.val[1]));
  kernel.packet[3] = vcombine_f32(vget_high_f32(zip01.val[1]), vget_high_f32(zip23.val[1]));
}
// In-place transpose of a 4x4 block of 32-bit integers held in four Packet4i.
// Rows 0/1 and rows 2/3 are zipped (interleaved) pairwise, then the low and
// high halves of the zipped vectors are recombined into the output rows.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i,4>& kernel)
{
  // All inputs are read here, before any packet is overwritten.
  const int32x4x2_t zip01 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
  const int32x4x2_t zip23 = vzipq_s32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = vcombine_s32(vget_low_s32(zip01.val[0]),  vget_low_s32(zip23.val[0]));
  kernel.packet[1] = vcombine_s32(vget_high_s32(zip01.val[0]), vget_high_s32(zip23.val[0]));
  kernel.packet[2] = vcombine_s32(vget_low_s32(zip01.val[1]),  vget_low_s32(zip23.val[1]));
  kernel.packet[3] = vcombine_s32(vget_high_s32(zip01.val[1]), vget_high_s32(zip23.val[1]));
}
//---------- double ----------
// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
// Confirmed at least with __apple_build_version__ = 6000054.
#ifdef __apple_build_version__
// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
// major toolchain updates.
#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
#else
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#endif
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
// Bug 907: workaround missing declarations of the following two functions in the ADK
// Defining these functions as templates ensures that if these intrinsics are
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
// and has lower priority in overload resolution.
template
<
typename
T
>
uint64x2_t
vreinterpretq_u64_f64
(
T
a
)
{
return
(
uint64x2_t
)
a
;
}
template
<
typename
T
>
float64x2_t
vreinterpretq_f64_u64
(
T
a
)
{
return
(
float64x2_t
)
a
;
}
typedef
float64x2_t
Packet2d
;
typedef
float64x1_t
Packet1d
;
template
<
>
struct
packet_traits
<
double
>
:
default_packet_traits
{
typedef
Packet2d
type
;
typedef
Packet2d
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
2
,
HasHalfPacket
=
0
,
HasDiv
=
1
,
// FIXME check the Has*
HasSin
=
0
,
HasCos
=
0
,
HasLog
=
0
,
HasExp
=
0
,
HasSqrt
=
0
};
};
template
<
>
struct
unpacket_traits
<
Packet2d
>
{
typedef
double
type
;
enum
{
size
=
2
,
alignment
=
Aligned16
};
typedef
Packet2d
half
;
};
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pset1
<
Packet2d
>
(
const
double
&
from
)
{
return
vdupq_n_f64
(
from
);
}
// Returns the packet {a + 0, a + 1}.
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
{
  const double lane_offsets[] = {0.0, 1.0};
  const Packet2d offsets = vld1q_f64(lane_offsets);
  const Packet2d base = pset1<Packet2d>(a);
  return vaddq_f64(base, offsets);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
padd
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
vaddq_f64
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
psub
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
vsubq_f64
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pnegate
(
const
Packet2d
&
a
)
{
return
vnegq_f64
(
a
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pconj
(
const
Packet2d
&
a
)
{
return
a
;
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmul
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
vmulq_f64
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pdiv
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
vdivq_f64
(
a
,
b
);
}
#ifdef __ARM_FEATURE_FMA
// See bug 936. See above comment about FMA for float.
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmadd
(
const
Packet2d
&
a
,
const
Packet2d
&
b
,
const
Packet2d
&
c
)
{
return
vfmaq_f64
(
c
,
a
,
b
);
}
#else
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmadd
(
const
Packet2d
&
a
,
const
Packet2d
&
b
,
const
Packet2d
&
c
)
{
return
vmlaq_f64
(
c
,
a
,
b
);
}
#endif
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmin
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
vminq_f64
(
a
,
b
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pmax
<
Packet2d
>
(
const
Packet2d
&
a
,
const
Packet2d
&
b
)
{
return
vmaxq_f64
(
a
,
b
);
}
// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
// Bitwise AND of two Packet2d, performed on their 64-bit integer reinterpretation.
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t a_bits = vreinterpretq_u64_f64(a);
  const uint64x2_t b_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(vandq_u64(a_bits, b_bits));
}
// Bitwise OR of two Packet2d, performed on their 64-bit integer reinterpretation.
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t a_bits = vreinterpretq_u64_f64(a);
  const uint64x2_t b_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(vorrq_u64(a_bits, b_bits));
}
// Bitwise XOR of two Packet2d, performed on their 64-bit integer reinterpretation.
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t a_bits = vreinterpretq_u64_f64(a);
  const uint64x2_t b_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(veorq_u64(a_bits, b_bits));
}
// Bit-clear of two Packet2d (vbicq_u64 applied to a and b, in that order),
// performed on their 64-bit integer reinterpretation.
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  const uint64x2_t a_bits = vreinterpretq_u64_f64(a);
  const uint64x2_t b_bits = vreinterpretq_u64_f64(b);
  return vreinterpretq_f64_u64(vbicq_u64(a_bits, b_bits));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pload
<
Packet2d
>
(
const
double
*
from
)
{
EIGEN_DEBUG_ALIGNED_LOAD
return
vld1q_f64
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
ploadu
<
Packet2d
>
(
const
double
*
from
)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return
vld1q_f64
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
ploaddup
<
Packet2d
>
(
const
double
*
from
)
{
return
vld1q_dup_f64
(
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstore
<
double
>
(
double
*
to
,
const
Packet2d
&
from
)
{
EIGEN_DEBUG_ALIGNED_STORE
vst1q_f64
(
to
,
from
);
}
template
<
>
EIGEN_STRONG_INLINE
void
pstoreu
<
double
>
(
double
*
to
,
const
Packet2d
&
from
)
{
EIGEN_DEBUG_UNALIGNED_STORE
vst1q_f64
(
to
,
from
);
}
// Strided gather: builds the packet {from[0], from[stride]}.
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  Packet2d gathered = pset1<Packet2d>(0.0);
  gathered = vsetq_lane_f64(from[0], gathered, 0);
  gathered = vsetq_lane_f64(from[stride], gathered, 1);
  return gathered;
}
// Strided scatter: writes lane 0 of `from` to to[0] and lane 1 to to[stride].
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  to[0]      = vgetq_lane_f64(from, 0);
  to[stride] = vgetq_lane_f64(from, 1);
}
template
<
>
EIGEN_STRONG_INLINE
void
prefetch
<
double
>
(
const
double
*
addr
)
{
EIGEN_ARM_PREFETCH
(
addr
);
}
// FIXME only store the 2 first elements ?
template
<
>
EIGEN_STRONG_INLINE
double
pfirst
<
Packet2d
>
(
const
Packet2d
&
a
)
{
return
vgetq_lane_f64
(
a
,
0
);
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
preverse
(
const
Packet2d
&
a
)
{
return
vcombine_f64
(
vget_high_f64
(
a
),
vget_low_f64
(
a
));
}
template
<
>
EIGEN_STRONG_INLINE
Packet2d
pabs
(
const
Packet2d
&
a
)
{
return
vabsq_f64
(
a
);
}
// Horizontal sum of the two lanes of a Packet2d.
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
// workaround ICE, see bug 907
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  return (vget_low_f64(a) + vget_high_f64(a))[0];
}
#else
// Uses the vadd_f64 intrinsic rather than operator+ on float64x1_t, so the code
// does not rely on the compiler providing arithmetic operators for NEON types.
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  return vget_lane_f64(vadd_f64(vget_low_f64(a), vget_high_f64(a)), 0);
}
#endif
// Packet-wise reduction of two Packet2d: returns {sum(vecs[0]), sum(vecs[1])}.
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  // NEON zip interleaves the supplied vectors; zip1 and zip2 together yield
  // the two rows of the transposed 2x2 block.
  const float64x2_t first_lanes  = vzip1q_f64(vecs[0], vecs[1]);
  const float64x2_t second_lanes = vzip2q_f64(vecs[0], vecs[1]);

  // Adding the transposed rows sums each input packet horizontally.
  return vaddq_f64(first_lanes, second_lanes);
}
// Other reduction functions:
// mul
// Multiplies the two lanes of `a` together: the low and high halves are
// multiplied and lane 0 of that one-lane result is returned. Only the way
// lane 0 is extracted depends on the compiler.
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
  return (vget_low_f64(a) * vget_high_f64(a))[0];
#else
  return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0);
#endif
}
// min
// Minimum reduction: applies vpminq_f64 to `a` paired with itself and
// returns lane 0 of the result.
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return vgetq_lane_f64(vpminq_f64(a, a), 0);
}
// max
// Maximum reduction: applies vpmaxq_f64 to `a` paired with itself and
// returns lane 0 of the result.
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return vgetq_lane_f64(vpmaxq_f64(a, a), 0);
}
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
{\
if (Offset!=0)\
first = Command(first, second, Offset);\
}\
};\
PALIGN_NEON
(
0
,
Packet2d
,
vextq_f64
)
PALIGN_NEON
(
1
,
Packet2d
,
vextq_f64
)
#undef PALIGN_NEON
// Transposes, in place, the 2x2 block of doubles held in `kernel`.
// Both zipped packets are computed from the original inputs before either
// packet slot is overwritten.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel)
{
  const float64x2_t row0 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
  const float64x2_t row1 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);

  kernel.packet[0] = row0;
  kernel.packet[1] = row1;
}
#endif // EIGEN_ARCH_ARM64
}
// end namespace internal
}
// end namespace Eigen
...
...
pydensecrf/densecrf/include/Eigen/src/Core/arch/SSE/Complex.h
View file @
13b115ab
...
...
@@ -22,13 +22,18 @@ struct Packet2cf
__m128
v
;
};
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template
<
>
struct
packet_traits
<
std
::
complex
<
float
>
>
:
default_packet_traits
{
typedef
Packet2cf
type
;
typedef
Packet2cf
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
1
,
size
=
2
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
...
...
@@ -39,11 +44,13 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2
=
0
,
HasMin
=
0
,
HasMax
=
0
,
HasSetLinear
=
0
HasSetLinear
=
0
,
HasBlend
=
1
};
};
#endif
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
}
;
};
template
<
>
struct
unpacket_traits
<
Packet2cf
>
{
typedef
std
::
complex
<
float
>
type
;
enum
{
size
=
2
,
alignment
=
Aligned16
};
typedef
Packet2cf
half
;
};
// Lane-wise sum of the float registers underlying `a` and `b`.
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(_mm_add_ps(a.v, b.v));
}
// Lane-wise difference (a - b) of the float registers underlying `a` and `b`.
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(_mm_sub_ps(a.v, b.v));
}
...
...
@@ -60,7 +67,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pmul
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
// TODO optimize it for SSE3 and 4
#ifdef EIGEN_VECTORIZE_SSE3
return
Packet2cf
(
_mm_addsub_ps
(
_mm_mul_ps
(
_mm_moveldup_ps
(
a
.
v
),
b
.
v
),
_mm_mul_ps
(
_mm_movehdup_ps
(
a
.
v
),
...
...
@@ -81,27 +87,48 @@ template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a,
// Bitwise XOR of the registers underlying `a` and `b`.
template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(_mm_xor_ps(a.v, b.v));
}
// Forwards the registers underlying `a` and `b`, in that order, to _mm_andnot_ps.
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
  return Packet2cf(_mm_andnot_ps(a.v, b.v));
}
// Loads a Packet2cf from `from` through the Packet4f aligned load, treating
// the complex array as a flat float array that starts at the real part of
// *from. real_ref is qualified with numext:: so the call does not depend on
// unqualified name lookup. Each specialization is defined exactly once.
template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD
  return Packet2cf(pload<Packet4f>(&numext::real_ref(*from)));
}

// Same as pload<Packet2cf>, but goes through the Packet4f unaligned load.
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from)));
}
// Broadcasts the complex scalar `from` to both complex slots of a Packet2cf,
// giving the float lane layout (re, im, re, im).
//
// The packet is built directly from the real and imaginary parts, so no
// uninitialized register is read and no compiler-specific preprocessor
// branches or warning suppressions are needed.
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
  const float re = std::real(from);
  const float im = std::imag(from);
  // _mm_set_ps lists its arguments from the highest lane down to lane 0.
  return Packet2cf(_mm_set_ps(im, re, im, re));
}
// Reads the single complex value at `from` and broadcasts it with pset1.
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
{
  return pset1<Packet2cf>(*from);
}
// Stores `from` to `to` through the float aligned store, addressing the
// destination from the real part of *to. The register is passed as an explicit
// Packet4f and real_ref is qualified with numext::, so overload resolution
// does not rely on implicit conversions or unqualified lookup.
// Each specialization is defined exactly once.
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet2cf& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  pstore(&numext::real_ref(*to), Packet4f(from.v));
}

// Same as pstore, but goes through the float unaligned store.
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  pstoreu(&numext::real_ref(*to), Packet4f(from.v));
}
// Builds a Packet2cf from two complex values `stride` elements apart.
// _mm_set_ps lists lanes from highest to lowest, so the resulting layout is
// (re0, im0, re1, im1) with element 0 taken from from[0*stride] and element 1
// from from[1*stride].
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]),
                              std::real(from[1*stride]),
                              std::imag(from[0*stride]),
                              std::real(from[0*stride])));
}
// Writes the two complex values held in `from` to memory `stride` elements
// apart. Each float is extracted by shuffling the wanted lane into lane 0 and
// reading it back with _mm_cvtss_f32.
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
  // First complex value: lanes 0 (real) and 1 (imaginary).
  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
  // Second complex value: lanes 2 (real) and 3 (imaginary).
  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}
// Issues an _mm_prefetch with the _MM_HINT_T0 hint for `addr`.
// The pointer is cast to SsePrefetchPtrType rather than a hard-coded
// `const char*`.
// NOTE(review): SsePrefetchPtrType is declared outside this section — presumably
// it selects the pointer type the compiler's _mm_prefetch expects; confirm.
// The specialization is defined exactly once.
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr)
{
  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
}
template
<
>
EIGEN_STRONG_INLINE
std
::
complex
<
float
>
pfirst
<
Packet2cf
>
(
const
Packet2cf
&
a
)
{
...
...
@@ -118,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
#endif
}
// Swaps the two complex values of `a`. The register is reinterpreted as two
// doubles (one per complex value), reversed with the Packet2d preverse, and
// reinterpreted back. The intermediate is wrapped in an explicit Packet2d so
// the Packet2d overload is selected without an implicit conversion.
// The specialization is defined exactly once.
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
  return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v)))));
}
template
<
>
EIGEN_STRONG_INLINE
std
::
complex
<
float
>
predux
<
Packet2cf
>
(
const
Packet2cf
&
a
)
{
...
...
@@ -202,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
// Mixed real/complex conj_helper support for the (Packet2cf, Packet4f) pair.
// The specializations come from EIGEN_MAKE_CONJ_HELPER_CPLX_REAL alone; they
// are not also written out by hand, so nothing is defined twice.
// NOTE(review): the macro is defined outside this section — confirm it emits both
// conj_helper<Packet4f, Packet2cf, false, false> and
// conj_helper<Packet2cf, Packet4f, false, false>.
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template
<
>
EIGEN_STRONG_INLINE
Packet2cf
pdiv
<
Packet2cf
>
(
const
Packet2cf
&
a
,
const
Packet2cf
&
b
)
{
...
...
@@ -228,7 +239,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
return
Packet2cf
(
_mm_div_ps
(
res
.
v
,
_mm_add_ps
(
s
,
_mm_castsi128_ps
(
_mm_shuffle_epi32
(
_mm_castps_si128
(
s
),
0xb1
)))));
}
// Swaps the real and imaginary part of each complex value in `x`
// (float lanes 0<->1 and 2<->3). The function has a single signature and body.
EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
{
  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
}
...
...
@@ -242,13 +253,18 @@ struct Packet1cd
__m128d
v
;
};
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template
<
>
struct
packet_traits
<
std
::
complex
<
double
>
>
:
default_packet_traits
{
typedef
Packet1cd
type
;
typedef
Packet1cd
half
;
enum
{
Vectorizable
=
1
,
AlignedOnScalar
=
0
,
size
=
1
,
HasHalfPacket
=
0
,
HasAdd
=
1
,
HasSub
=
1
,
...
...
@@ -262,12 +278,13 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
HasSetLinear
=
0
};
};
#endif
template
<
>
struct
unpacket_traits
<
Packet1cd
>
{
typedef
std
::
complex
<
double
>
type
;
enum
{
size
=
1
}
;
};
template
<
>
struct
unpacket_traits
<
Packet1cd
>
{
typedef
std
::
complex
<
double
>
type
;
enum
{
size
=
1
,
alignment
=
Aligned16
};
typedef
Packet1cd
half
;
};
// Lane-wise sum of the double registers underlying `a` and `b`.
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  return Packet1cd(_mm_add_pd(a.v, b.v));
}
// Lane-wise difference (a - b) of the double registers underlying `a` and `b`.
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
  return Packet1cd(_mm_sub_pd(a.v, b.v));
}
// Negates the complex value in `a` by delegating to the Packet2d pnegate.
// The register is wrapped in an explicit Packet2d so the Packet2d overload is
// selected without an implicit conversion.
// The specialization is defined exactly once.
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
{
  return Packet1cd(pnegate(Packet2d(a.v)));
}
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pconj
(
const
Packet1cd
&
a
)
{
const
__m128d
mask
=
_mm_castsi128_pd
(
_mm_set_epi32
(
0x80000000
,
0x0
,
0x0
,
0x0
));
...
...
@@ -276,9 +293,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pmul
<
Packet1cd
>
(
const
Packet1cd
&
a
,
const
Packet1cd
&
b
)
{
// TODO optimize it for SSE3 and 4
#ifdef EIGEN_VECTORIZE_SSE3
return
Packet1cd
(
_mm_addsub_pd
(
_mm_mul_pd
(
vec2d_swizzle1
(
a
.
v
,
0
,
0
),
b
.
v
),
return
Packet1cd
(
_mm_addsub_pd
(
_mm_mul_pd
(
_mm_movedup_pd
(
a
.
v
),
b
.
v
),
_mm_mul_pd
(
vec2d_swizzle1
(
a
.
v
,
1
,
1
),
vec2d_swizzle1
(
b
.
v
,
1
,
0
))));
#else
...
...
@@ -305,10 +321,10 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
// Reads the single complex value at `from` and turns it into a packet with pset1.
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
{
  return pset1<Packet1cd>(*from);
}
// FIXME force unaligned store, this is a temporary fix
// Stores `from` to `to` through the double aligned store, viewing the
// destination as a plain double array. The register is passed as an explicit
// Packet2d so the Packet2d overload is selected without an implicit conversion.
// Each specialization is defined exactly once.
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double>* to, const Packet1cd& from)
{
  EIGEN_DEBUG_ALIGNED_STORE
  pstore((double*)to, Packet2d(from.v));
}

// Same as pstore, but goes through the double unaligned store.
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE
  pstoreu((double*)to, Packet2d(from.v));
}
// Issues an _mm_prefetch with the _MM_HINT_T0 hint for `addr`.
// The pointer is cast to SsePrefetchPtrType rather than a hard-coded
// `const char*`.
// NOTE(review): SsePrefetchPtrType is declared outside this section — presumably
// it selects the pointer type the compiler's _mm_prefetch expects; confirm.
// The specialization is defined exactly once.
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr)
{
  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
}
template
<
>
EIGEN_STRONG_INLINE
std
::
complex
<
double
>
pfirst
<
Packet1cd
>
(
const
Packet1cd
&
a
)
{
...
...
@@ -398,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
// Mixed real/complex conj_helper support for the (Packet1cd, Packet2d) pair.
// The specializations come from EIGEN_MAKE_CONJ_HELPER_CPLX_REAL alone; they
// are not also written out by hand, so nothing is defined twice.
// NOTE(review): the macro is defined outside this section — confirm it emits both
// conj_helper<Packet2d, Packet1cd, false, false> and
// conj_helper<Packet1cd, Packet2d, false, false>.
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template
<
>
EIGEN_STRONG_INLINE
Packet1cd
pdiv
<
Packet1cd
>
(
const
Packet1cd
&
a
,
const
Packet1cd
&
b
)
{
...
...
@@ -424,9 +424,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
return
Packet1cd
(
_mm_div_pd
(
res
.
v
,
_mm_add_pd
(
s
,
_mm_shuffle_pd
(
s
,
s
,
0x1
))));
}
// Swaps the real and imaginary part of the complex value in `x` by reversing
// the two doubles of its register. The function has a single signature and body.
EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
{
  return Packet1cd(preverse(Packet2d(x.v)));
}
// Transposes, in place, the 2x2 block of complex floats held in `kernel`.
// Each register is viewed as two doubles so that one complex value moves as
// one 64-bit unit; the low and high halves of the two rows are then paired up.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
  const __m128d row0 = _mm_castps_pd(kernel.packet[0].v);
  const __m128d row1 = _mm_castps_pd(kernel.packet[1].v);

  const __m128 highHalves = _mm_castpd_ps(_mm_unpackhi_pd(row0, row1));
  const __m128 lowHalves  = _mm_castpd_ps(_mm_unpacklo_pd(row0, row1));

  kernel.packet[0].v = lowHalves;
  kernel.packet[1].v = highHalves;
}
// Selects, per complex value, between `thenPacket` and `elsePacket` according
// to `ifPacket`. Both inputs are reinterpreted as pairs of doubles so the
// two-entry selector can be handled by the Packet2d pblend, and the blended
// register is reinterpreted back to floats.
template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket)
{
  __m128d blended = pblend<Packet2d>(ifPacket,
                                     _mm_castps_pd(thenPacket.v),
                                     _mm_castps_pd(elsePacket.v));
  return Packet2cf(_mm_castpd_ps(blended));
}
// Returns `a` with its first complex value replaced by `b`: the 64 bits of
// `b` are loaded into the low half of the register with _mm_loadl_pi.
template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
{
  return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
}
// A Packet1cd holds a single complex value, so the incoming packet is ignored
// and the result is simply `b` turned into a packet with pset1.
template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
{
  return pset1<Packet1cd>(b);
}
// Returns `a` with its last complex value replaced by `b`: the 64 bits of
// `b` are loaded into the high half of the register with _mm_loadh_pi.
template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
{
  return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
}
// A Packet1cd holds a single complex value, so the incoming packet is ignored
// and the result is simply `b` turned into a packet with pset1.
// The body contains only this return: the packet parameter is unnamed, so no
// statement here may refer to it.
template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
{
  return pset1<Packet1cd>(b);
}
}
// end namespace internal
...
...
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment