Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
d36e3414
Unverified
Commit
d36e3414
authored
Jun 01, 2020
by
peastman
Committed by
GitHub
Jun 01, 2020
Browse files
Merge pull request #2658 from mark-mb/master
Improvements for ARM64 vector implementation
parents
6b39ec7e
8e3f5bc6
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
334 additions
and
305 deletions
+334
-305
openmmapi/include/openmm/internal/vectorize8.h
openmmapi/include/openmm/internal/vectorize8.h
+40
-40
openmmapi/include/openmm/internal/vectorize_neon.h
openmmapi/include/openmm/internal/vectorize_neon.h
+99
-78
openmmapi/include/openmm/internal/vectorize_pnacl.h
openmmapi/include/openmm/internal/vectorize_pnacl.h
+67
-63
openmmapi/include/openmm/internal/vectorize_ppc.h
openmmapi/include/openmm/internal/vectorize_ppc.h
+71
-67
openmmapi/include/openmm/internal/vectorize_sse.h
openmmapi/include/openmm/internal/vectorize_sse.h
+57
-57
No files found.
openmmapi/include/openmm/internal/vectorize8.h
View file @
d36e3414
...
@@ -75,55 +75,55 @@ public:
...
@@ -75,55 +75,55 @@ public:
void
store
(
float
*
v
)
const
{
void
store
(
float
*
v
)
const
{
_mm256_storeu_ps
(
v
,
val
);
_mm256_storeu_ps
(
v
,
val
);
}
}
fvec8
operator
+
(
const
fvec8
&
other
)
const
{
fvec8
operator
+
(
fvec8
other
)
const
{
return
_mm256_add_ps
(
val
,
other
);
return
_mm256_add_ps
(
val
,
other
);
}
}
fvec8
operator
-
(
const
fvec8
&
other
)
const
{
fvec8
operator
-
(
fvec8
other
)
const
{
return
_mm256_sub_ps
(
val
,
other
);
return
_mm256_sub_ps
(
val
,
other
);
}
}
fvec8
operator
*
(
const
fvec8
&
other
)
const
{
fvec8
operator
*
(
fvec8
other
)
const
{
return
_mm256_mul_ps
(
val
,
other
);
return
_mm256_mul_ps
(
val
,
other
);
}
}
fvec8
operator
/
(
const
fvec8
&
other
)
const
{
fvec8
operator
/
(
fvec8
other
)
const
{
return
_mm256_div_ps
(
val
,
other
);
return
_mm256_div_ps
(
val
,
other
);
}
}
void
operator
+=
(
const
fvec8
&
other
)
{
void
operator
+=
(
fvec8
other
)
{
val
=
_mm256_add_ps
(
val
,
other
);
val
=
_mm256_add_ps
(
val
,
other
);
}
}
void
operator
-=
(
const
fvec8
&
other
)
{
void
operator
-=
(
fvec8
other
)
{
val
=
_mm256_sub_ps
(
val
,
other
);
val
=
_mm256_sub_ps
(
val
,
other
);
}
}
void
operator
*=
(
const
fvec8
&
other
)
{
void
operator
*=
(
fvec8
other
)
{
val
=
_mm256_mul_ps
(
val
,
other
);
val
=
_mm256_mul_ps
(
val
,
other
);
}
}
void
operator
/=
(
const
fvec8
&
other
)
{
void
operator
/=
(
fvec8
other
)
{
val
=
_mm256_div_ps
(
val
,
other
);
val
=
_mm256_div_ps
(
val
,
other
);
}
}
fvec8
operator
-
()
const
{
fvec8
operator
-
()
const
{
return
_mm256_sub_ps
(
_mm256_set1_ps
(
0.0
f
),
val
);
return
_mm256_sub_ps
(
_mm256_set1_ps
(
0.0
f
),
val
);
}
}
fvec8
operator
&
(
const
fvec8
&
other
)
const
{
fvec8
operator
&
(
fvec8
other
)
const
{
return
_mm256_and_ps
(
val
,
other
);
return
_mm256_and_ps
(
val
,
other
);
}
}
fvec8
operator
|
(
const
fvec8
&
other
)
const
{
fvec8
operator
|
(
fvec8
&
other
)
const
{
return
_mm256_or_ps
(
val
,
other
);
return
_mm256_or_ps
(
val
,
other
);
}
}
fvec8
operator
==
(
const
fvec8
&
other
)
const
{
fvec8
operator
==
(
fvec8
other
)
const
{
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_EQ_OQ
);
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_EQ_OQ
);
}
}
fvec8
operator
!=
(
const
fvec8
&
other
)
const
{
fvec8
operator
!=
(
fvec8
other
)
const
{
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_NEQ_OQ
);
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_NEQ_OQ
);
}
}
fvec8
operator
>
(
const
fvec8
&
other
)
const
{
fvec8
operator
>
(
fvec8
other
)
const
{
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_GT_OQ
);
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_GT_OQ
);
}
}
fvec8
operator
<
(
const
fvec8
&
other
)
const
{
fvec8
operator
<
(
fvec8
other
)
const
{
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_LT_OQ
);
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_LT_OQ
);
}
}
fvec8
operator
>=
(
const
fvec8
&
other
)
const
{
fvec8
operator
>=
(
fvec8
other
)
const
{
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_GE_OQ
);
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_GE_OQ
);
}
}
fvec8
operator
<=
(
const
fvec8
&
other
)
const
{
fvec8
operator
<=
(
fvec8
other
)
const
{
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_LE_OQ
);
return
_mm256_cmp_ps
(
val
,
other
,
_CMP_LE_OQ
);
}
}
operator
ivec8
()
const
;
operator
ivec8
()
const
;
...
@@ -159,10 +159,10 @@ public:
...
@@ -159,10 +159,10 @@ public:
void
store
(
int
*
v
)
const
{
void
store
(
int
*
v
)
const
{
_mm256_storeu_si256
((
__m256i
*
)
v
,
val
);
_mm256_storeu_si256
((
__m256i
*
)
v
,
val
);
}
}
ivec8
operator
&
(
const
ivec8
&
other
)
const
{
ivec8
operator
&
(
ivec8
other
)
const
{
return
_mm256_castps_si256
(
_mm256_and_ps
(
_mm256_castsi256_ps
(
val
),
_mm256_castsi256_ps
(
other
.
val
)));
return
_mm256_castps_si256
(
_mm256_and_ps
(
_mm256_castsi256_ps
(
val
),
_mm256_castsi256_ps
(
other
.
val
)));
}
}
ivec8
operator
|
(
const
ivec8
&
other
)
const
{
ivec8
operator
|
(
ivec8
other
)
const
{
return
_mm256_castps_si256
(
_mm256_or_ps
(
_mm256_castsi256_ps
(
val
),
_mm256_castsi256_ps
(
other
.
val
)));
return
_mm256_castps_si256
(
_mm256_or_ps
(
_mm256_castsi256_ps
(
val
),
_mm256_castsi256_ps
(
other
.
val
)));
}
}
operator
fvec8
()
const
;
operator
fvec8
()
const
;
...
@@ -193,36 +193,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) {
...
@@ -193,36 +193,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) {
// Functions that operate on fvec8s.
// Functions that operate on fvec8s.
static
inline
fvec8
floor
(
const
fvec8
&
v
)
{
static
inline
fvec8
floor
(
fvec8
v
)
{
return
fvec8
(
_mm256_round_ps
(
v
.
val
,
0x09
));
return
fvec8
(
_mm256_round_ps
(
v
.
val
,
0x09
));
}
}
static
inline
fvec8
ceil
(
const
fvec8
&
v
)
{
static
inline
fvec8
ceil
(
fvec8
v
)
{
return
fvec8
(
_mm256_round_ps
(
v
.
val
,
0x0A
));
return
fvec8
(
_mm256_round_ps
(
v
.
val
,
0x0A
));
}
}
static
inline
fvec8
round
(
const
fvec8
&
v
)
{
static
inline
fvec8
round
(
fvec8
v
)
{
return
fvec8
(
_mm256_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
return
fvec8
(
_mm256_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
}
}
static
inline
fvec8
min
(
const
fvec8
&
v1
,
const
fvec8
&
v2
)
{
static
inline
fvec8
min
(
fvec8
v1
,
fvec8
v2
)
{
return
fvec8
(
_mm256_min_ps
(
v1
.
val
,
v2
.
val
));
return
fvec8
(
_mm256_min_ps
(
v1
.
val
,
v2
.
val
));
}
}
static
inline
fvec8
max
(
const
fvec8
&
v1
,
const
fvec8
&
v2
)
{
static
inline
fvec8
max
(
fvec8
v1
,
fvec8
v2
)
{
return
fvec8
(
_mm256_max_ps
(
v1
.
val
,
v2
.
val
));
return
fvec8
(
_mm256_max_ps
(
v1
.
val
,
v2
.
val
));
}
}
static
inline
fvec8
abs
(
const
fvec8
&
v
)
{
static
inline
fvec8
abs
(
fvec8
v
)
{
static
const
__m256
mask
=
_mm256_castsi256_ps
(
_mm256_set1_epi32
(
0x7FFFFFFF
));
static
const
__m256
mask
=
_mm256_castsi256_ps
(
_mm256_set1_epi32
(
0x7FFFFFFF
));
return
fvec8
(
_mm256_and_ps
(
v
.
val
,
mask
));
return
fvec8
(
_mm256_and_ps
(
v
.
val
,
mask
));
}
}
static
inline
fvec8
sqrt
(
const
fvec8
&
v
)
{
static
inline
fvec8
sqrt
(
fvec8
v
)
{
return
fvec8
(
_mm256_sqrt_ps
(
v
.
val
));
return
fvec8
(
_mm256_sqrt_ps
(
v
.
val
));
}
}
static
inline
fvec8
rsqrt
(
const
fvec8
&
v
)
{
static
inline
fvec8
rsqrt
(
fvec8
v
)
{
// Initial estimate of rsqrt().
// Initial estimate of rsqrt().
fvec8
y
(
_mm256_rsqrt_ps
(
v
.
val
));
fvec8
y
(
_mm256_rsqrt_ps
(
v
.
val
));
...
@@ -234,17 +234,17 @@ static inline fvec8 rsqrt(const fvec8& v) {
...
@@ -234,17 +234,17 @@ static inline fvec8 rsqrt(const fvec8& v) {
return
y
;
return
y
;
}
}
static
inline
float
dot8
(
const
fvec8
&
v1
,
const
fvec8
&
v2
)
{
static
inline
float
dot8
(
fvec8
v1
,
fvec8
v2
)
{
fvec8
result
=
_mm256_dp_ps
(
v1
,
v2
,
0xF1
);
fvec8
result
=
_mm256_dp_ps
(
v1
,
v2
,
0xF1
);
return
_mm_cvtss_f32
(
result
.
lowerVec
())
+
_mm_cvtss_f32
(
result
.
upperVec
());
return
_mm_cvtss_f32
(
result
.
lowerVec
())
+
_mm_cvtss_f32
(
result
.
upperVec
());
}
}
static
inline
float
reduceAdd
(
const
fvec8
v
)
{
static
inline
float
reduceAdd
(
fvec8
v
)
{
// :TODO: There are more efficient ways to do this.
// :TODO: There are more efficient ways to do this.
return
dot8
(
v
,
fvec8
(
1.0
f
));
return
dot8
(
v
,
fvec8
(
1.0
f
));
}
}
static
inline
void
transpose
(
const
fvec4
&
in1
,
const
fvec4
&
in2
,
const
fvec4
&
in3
,
const
fvec4
&
in4
,
const
fvec4
&
in5
,
const
fvec4
&
in6
,
const
fvec4
&
in7
,
const
fvec4
&
in8
,
fvec8
&
out1
,
fvec8
&
out2
,
fvec8
&
out3
,
fvec8
&
out4
)
{
static
inline
void
transpose
(
fvec4
in1
,
fvec4
in2
,
fvec4
in3
,
fvec4
in4
,
fvec4
in5
,
fvec4
in6
,
fvec4
in7
,
fvec4
in8
,
fvec8
&
out1
,
fvec8
&
out2
,
fvec8
&
out3
,
fvec8
&
out4
)
{
fvec4
i1
=
in1
,
i2
=
in2
,
i3
=
in3
,
i4
=
in4
;
fvec4
i1
=
in1
,
i2
=
in2
,
i3
=
in3
,
i4
=
in4
;
fvec4
i5
=
in5
,
i6
=
in6
,
i7
=
in7
,
i8
=
in8
;
fvec4
i5
=
in5
,
i6
=
in6
,
i7
=
in7
,
i8
=
in8
;
_MM_TRANSPOSE4_PS
(
i1
,
i2
,
i3
,
i4
);
_MM_TRANSPOSE4_PS
(
i1
,
i2
,
i3
,
i4
);
...
@@ -275,7 +275,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8&
...
@@ -275,7 +275,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8&
transpose
(
in
[
0
],
in
[
1
],
in
[
2
],
in
[
3
],
in
[
4
],
in
[
5
],
in
[
6
],
in
[
7
],
out1
,
out2
,
out3
,
out4
);
transpose
(
in
[
0
],
in
[
1
],
in
[
2
],
in
[
3
],
in
[
4
],
in
[
5
],
in
[
6
],
in
[
7
],
out1
,
out2
,
out3
,
out4
);
}
}
static
inline
void
transpose
(
const
fvec8
&
in1
,
const
fvec8
&
in2
,
const
fvec8
&
in3
,
const
fvec8
&
in4
,
fvec4
&
out1
,
fvec4
&
out2
,
fvec4
&
out3
,
fvec4
&
out4
,
fvec4
&
out5
,
fvec4
&
out6
,
fvec4
&
out7
,
fvec4
&
out8
)
{
static
inline
void
transpose
(
fvec8
in1
,
fvec8
in2
,
fvec8
in3
,
fvec8
in4
,
fvec4
&
out1
,
fvec4
&
out2
,
fvec4
&
out3
,
fvec4
&
out4
,
fvec4
&
out5
,
fvec4
&
out6
,
fvec4
&
out7
,
fvec4
&
out8
)
{
out1
=
in1
.
lowerVec
();
out1
=
in1
.
lowerVec
();
out2
=
in2
.
lowerVec
();
out2
=
in2
.
lowerVec
();
out3
=
in3
.
lowerVec
();
out3
=
in3
.
lowerVec
();
...
@@ -291,40 +291,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
...
@@ -291,40 +291,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
/**
/**
* Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
* Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
*/
*/
static
inline
void
transpose
(
const
fvec8
&
in1
,
const
fvec8
&
in2
,
const
fvec8
&
in3
,
const
fvec8
&
in4
,
fvec4
out
[
8
])
{
static
inline
void
transpose
(
fvec8
in1
,
fvec8
in2
,
fvec8
in3
,
fvec8
in4
,
fvec4
out
[
8
])
{
transpose
(
in1
,
in2
,
in3
,
in4
,
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
],
out
[
4
],
out
[
5
],
out
[
6
],
out
[
7
]);
transpose
(
in1
,
in2
,
in3
,
in4
,
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
],
out
[
4
],
out
[
5
],
out
[
6
],
out
[
7
]);
}
}
// Functions that operate on ivec8s.
// Functions that operate on ivec8s.
static
inline
bool
any
(
const
ivec8
&
v
)
{
static
inline
bool
any
(
ivec8
v
)
{
return
!
_mm256_testz_si256
(
v
,
_mm256_set1_epi32
(
0xFFFFFFFF
));
return
!
_mm256_testz_si256
(
v
,
_mm256_set1_epi32
(
0xFFFFFFFF
));
}
}
// Mathematical operators involving a scalar and a vector.
// Mathematical operators involving a scalar and a vector.
static
inline
fvec8
operator
+
(
float
v1
,
const
fvec8
&
v2
)
{
static
inline
fvec8
operator
+
(
float
v1
,
fvec8
v2
)
{
return
fvec8
(
v1
)
+
v2
;
return
fvec8
(
v1
)
+
v2
;
}
}
static
inline
fvec8
operator
-
(
float
v1
,
const
fvec8
&
v2
)
{
static
inline
fvec8
operator
-
(
float
v1
,
fvec8
v2
)
{
return
fvec8
(
v1
)
-
v2
;
return
fvec8
(
v1
)
-
v2
;
}
}
static
inline
fvec8
operator
*
(
float
v1
,
const
fvec8
&
v2
)
{
static
inline
fvec8
operator
*
(
float
v1
,
fvec8
v2
)
{
return
fvec8
(
v1
)
*
v2
;
return
fvec8
(
v1
)
*
v2
;
}
}
static
inline
fvec8
operator
/
(
float
v1
,
const
fvec8
&
v2
)
{
static
inline
fvec8
operator
/
(
float
v1
,
fvec8
v2
)
{
return
fvec8
(
v1
)
/
v2
;
return
fvec8
(
v1
)
/
v2
;
}
}
// Operation for blending fvec8 from a full bitmask.
// Operation for blending fvec8 from a full bitmask.
static
inline
fvec8
blend
(
const
fvec8
&
v1
,
const
fvec8
&
v2
,
const
fvec8
&
mask
)
{
static
inline
fvec8
blend
(
fvec8
v1
,
fvec8
v2
,
fvec8
mask
)
{
return
fvec8
(
_mm256_blendv_ps
(
v1
.
val
,
v2
.
val
,
mask
.
val
));
return
fvec8
(
_mm256_blendv_ps
(
v1
.
val
,
v2
.
val
,
mask
.
val
));
}
}
static
inline
fvec8
blendZero
(
const
fvec8
v
,
const
fvec8
mask
)
{
static
inline
fvec8
blendZero
(
fvec8
v
,
fvec8
mask
)
{
return
blend
(
0.0
f
,
v
,
mask
);
return
blend
(
0.0
f
,
v
,
mask
);
}
}
...
@@ -333,7 +333,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
...
@@ -333,7 +333,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
* of vectors. The first result vector contains the values at the given indexes, and the second
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
* result vector contains the values from each respective index+1.
*/
*/
static
inline
void
gatherVecPair
(
const
float
*
table
,
const
ivec8
index
,
fvec8
&
out0
,
fvec8
&
out1
)
{
static
inline
void
gatherVecPair
(
const
float
*
table
,
ivec8
index
,
fvec8
&
out0
,
fvec8
&
out1
)
{
const
auto
lower
=
index
.
lowerVec
();
const
auto
lower
=
index
.
lowerVec
();
const
auto
upper
=
index
.
upperVec
();
const
auto
upper
=
index
.
upperVec
();
...
@@ -368,7 +368,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o
...
@@ -368,7 +368,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o
* output[2] = (Z0 + Z1 + Z2 + ...)
* output[2] = (Z0 + Z1 + Z2 + ...)
* output[3] = undefined
* output[3] = undefined
*/
*/
static
inline
fvec4
reduceToVec3
(
const
fvec8
x
,
const
fvec8
y
,
const
fvec8
z
)
{
static
inline
fvec4
reduceToVec3
(
fvec8
x
,
fvec8
y
,
fvec8
z
)
{
// The general strategy for a vector reduce-add operation is to take values from
// The general strategy for a vector reduce-add operation is to take values from
// different parts of the vector and overlap them a different part of the vector and then
// different parts of the vector and overlap them a different part of the vector and then
// add together. Repeat this several times until all values have been summed. Initially 8
// add together. Repeat this several times until all values have been summed. Initially 8
...
...
openmmapi/include/openmm/internal/vectorize_neon.h
View file @
d36e3414
...
@@ -76,10 +76,7 @@ public:
...
@@ -76,10 +76,7 @@ public:
fvec4
()
=
default
;
fvec4
()
=
default
;
fvec4
(
float
v
)
:
val
(
vdupq_n_f32
(
v
))
{}
fvec4
(
float
v
)
:
val
(
vdupq_n_f32
(
v
))
{}
fvec4
(
float
v1
,
float
v2
,
float
v3
,
float
v4
)
{
fvec4
(
float
v1
,
float
v2
,
float
v3
,
float
v4
)
:
val
{
v1
,
v2
,
v3
,
v4
}
{}
float
v
[]
=
{
v1
,
v2
,
v3
,
v4
};
val
=
vld1q_f32
(
v
);
}
fvec4
(
float32x4_t
v
)
:
val
(
v
)
{}
fvec4
(
float32x4_t
v
)
:
val
(
v
)
{}
fvec4
(
const
float
*
v
)
:
val
(
vld1q_f32
(
v
))
{}
fvec4
(
const
float
*
v
)
:
val
(
vld1q_f32
(
v
))
{}
operator
float32x4_t
()
const
{
operator
float32x4_t
()
const
{
...
@@ -121,16 +118,16 @@ public:
...
@@ -121,16 +118,16 @@ public:
v
[
2
]
=
vgetq_lane_f32
(
val
,
2
);
v
[
2
]
=
vgetq_lane_f32
(
val
,
2
);
}
}
fvec4
operator
+
(
const
fvec4
&
other
)
const
{
fvec4
operator
+
(
fvec4
other
)
const
{
return
vaddq_f32
(
val
,
other
);
return
vaddq_f32
(
val
,
other
);
}
}
fvec4
operator
-
(
const
fvec4
&
other
)
const
{
fvec4
operator
-
(
fvec4
other
)
const
{
return
vsubq_f32
(
val
,
other
);
return
vsubq_f32
(
val
,
other
);
}
}
fvec4
operator
*
(
const
fvec4
&
other
)
const
{
fvec4
operator
*
(
fvec4
other
)
const
{
return
vmulq_f32
(
val
,
other
);
return
vmulq_f32
(
val
,
other
);
}
}
fvec4
operator
/
(
const
fvec4
&
other
)
const
{
fvec4
operator
/
(
fvec4
other
)
const
{
// NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
// NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
float32x4_t
reciprocal
=
vrecpeq_f32
(
other
);
float32x4_t
reciprocal
=
vrecpeq_f32
(
other
);
...
@@ -139,45 +136,34 @@ public:
...
@@ -139,45 +136,34 @@ public:
fvec4
result
=
vmulq_f32
(
val
,
reciprocal
);
fvec4
result
=
vmulq_f32
(
val
,
reciprocal
);
return
result
;
return
result
;
}
}
void
operator
+=
(
const
fvec4
&
other
)
{
void
operator
+=
(
fvec4
other
)
{
val
=
vaddq_f32
(
val
,
other
);
val
=
vaddq_f32
(
val
,
other
);
}
}
void
operator
-=
(
const
fvec4
&
other
)
{
void
operator
-=
(
fvec4
other
)
{
val
=
vsubq_f32
(
val
,
other
);
val
=
vsubq_f32
(
val
,
other
);
}
}
void
operator
*=
(
const
fvec4
&
other
)
{
void
operator
*=
(
fvec4
other
)
{
val
=
vmulq_f32
(
val
,
other
);
val
=
vmulq_f32
(
val
,
other
);
}
}
void
operator
/=
(
const
fvec4
&
other
)
{
void
operator
/=
(
fvec4
other
)
{
val
=
*
this
/
other
;
val
=
*
this
/
other
;
}
}
fvec4
operator
-
()
const
{
fvec4
operator
-
()
const
{
return
vnegq_f32
(
val
);
return
vnegq_f32
(
val
);
}
}
fvec4
operator
&
(
const
fvec4
&
other
)
const
{
fvec4
operator
&
(
fvec4
other
)
const
{
return
vreinterpretq_f32_u32
(
vandq_u32
(
vreinterpretq_u32_f32
(
val
),
vreinterpretq_u32_f32
(
other
)));
return
vreinterpretq_f32_u32
(
vandq_u32
(
vreinterpretq_u32_f32
(
val
),
vreinterpretq_u32_f32
(
other
)));
}
}
fvec4
operator
|
(
const
fvec4
&
other
)
const
{
fvec4
operator
|
(
fvec4
other
)
const
{
return
vreinterpretq_f32_u32
(
vorrq_u32
(
vreinterpretq_u32_f32
(
val
),
vreinterpretq_u32_f32
(
other
)));
return
vreinterpretq_f32_u32
(
vorrq_u32
(
vreinterpretq_u32_f32
(
val
),
vreinterpretq_u32_f32
(
other
)));
}
}
fvec4
operator
==
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vceqq_f32
(
val
,
other
)));
ivec4
operator
==
(
fvec4
other
)
const
;
}
ivec4
operator
!=
(
fvec4
other
)
const
;
fvec4
operator
!=
(
const
fvec4
&
other
)
const
{
ivec4
operator
>
(
fvec4
other
)
const
;
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vmvnq_u32
(
vceqq_f32
(
val
,
other
))));
// not(equals(val, other))
ivec4
operator
<
(
fvec4
other
)
const
;
}
ivec4
operator
>=
(
fvec4
other
)
const
;
fvec4
operator
>
(
const
fvec4
&
other
)
const
{
ivec4
operator
<=
(
fvec4
other
)
const
;
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcgtq_f32
(
val
,
other
)));
}
fvec4
operator
<
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcltq_f32
(
val
,
other
)));
}
fvec4
operator
>=
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcgeq_f32
(
val
,
other
)));
}
fvec4
operator
<=
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcleq_f32
(
val
,
other
)));
}
operator
ivec4
()
const
;
operator
ivec4
()
const
;
/**
/**
...
@@ -198,10 +184,7 @@ public:
...
@@ -198,10 +184,7 @@ public:
ivec4
()
{}
ivec4
()
{}
ivec4
(
int
v
)
:
val
(
vdupq_n_s32
(
v
))
{}
ivec4
(
int
v
)
:
val
(
vdupq_n_s32
(
v
))
{}
ivec4
(
int
v1
,
int
v2
,
int
v3
,
int
v4
)
{
ivec4
(
int
v1
,
int
v2
,
int
v3
,
int
v4
)
:
val
{
v1
,
v2
,
v3
,
v4
}
{}
int
v
[]
=
{
v1
,
v2
,
v3
,
v4
};
val
=
vld1q_s32
(
v
);
}
ivec4
(
int32x4_t
v
)
:
val
(
v
)
{}
ivec4
(
int32x4_t
v
)
:
val
(
v
)
{}
ivec4
(
const
int
*
v
)
:
val
(
vld1q_s32
(
v
))
{}
ivec4
(
const
int
*
v
)
:
val
(
vld1q_s32
(
v
))
{}
operator
int32x4_t
()
const
{
operator
int32x4_t
()
const
{
...
@@ -223,49 +206,49 @@ public:
...
@@ -223,49 +206,49 @@ public:
void
store
(
int
*
v
)
const
{
void
store
(
int
*
v
)
const
{
vst1q_s32
(
v
,
val
);
vst1q_s32
(
v
,
val
);
}
}
ivec4
operator
+
(
const
ivec4
&
other
)
const
{
ivec4
operator
+
(
ivec4
other
)
const
{
return
vaddq_s32
(
val
,
other
);
return
vaddq_s32
(
val
,
other
);
}
}
ivec4
operator
-
(
const
ivec4
&
other
)
const
{
ivec4
operator
-
(
ivec4
other
)
const
{
return
vsubq_s32
(
val
,
other
);
return
vsubq_s32
(
val
,
other
);
}
}
ivec4
operator
*
(
const
ivec4
&
other
)
const
{
ivec4
operator
*
(
ivec4
other
)
const
{
return
vmulq_s32
(
val
,
other
);
return
vmulq_s32
(
val
,
other
);
}
}
void
operator
+=
(
const
ivec4
&
other
)
{
void
operator
+=
(
ivec4
other
)
{
val
=
vaddq_s32
(
val
,
other
);
val
=
vaddq_s32
(
val
,
other
);
}
}
void
operator
-=
(
const
ivec4
&
other
)
{
void
operator
-=
(
ivec4
other
)
{
val
=
vsubq_s32
(
val
,
other
);
val
=
vsubq_s32
(
val
,
other
);
}
}
void
operator
*=
(
const
ivec4
&
other
)
{
void
operator
*=
(
ivec4
other
)
{
val
=
vmulq_s32
(
val
,
other
);
val
=
vmulq_s32
(
val
,
other
);
}
}
ivec4
operator
-
()
const
{
ivec4
operator
-
()
const
{
return
vnegq_s32
(
val
);
return
vnegq_s32
(
val
);
}
}
ivec4
operator
&
(
const
ivec4
&
other
)
const
{
ivec4
operator
&
(
ivec4
other
)
const
{
return
vandq_s32
(
val
,
other
);
return
vandq_s32
(
val
,
other
);
}
}
ivec4
operator
|
(
const
ivec4
&
other
)
const
{
ivec4
operator
|
(
ivec4
other
)
const
{
return
vorrq_s32
(
val
,
other
);
return
vorrq_s32
(
val
,
other
);
}
}
ivec4
operator
==
(
const
ivec4
&
other
)
const
{
ivec4
operator
==
(
ivec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vceqq_s32
(
val
,
other
));
return
vreinterpretq_s32_u32
(
vceqq_s32
(
val
,
other
));
}
}
ivec4
operator
!=
(
const
ivec4
&
other
)
const
{
ivec4
operator
!=
(
ivec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vmvnq_u32
(
vceqq_s32
(
val
,
other
)));
// not(equal(val, other))
return
vreinterpretq_s32_u32
(
vmvnq_u32
(
vceqq_s32
(
val
,
other
)));
// not(equal(val, other))
}
}
ivec4
operator
>
(
const
ivec4
&
other
)
const
{
ivec4
operator
>
(
ivec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcgtq_s32
(
val
,
other
));
return
vreinterpretq_s32_u32
(
vcgtq_s32
(
val
,
other
));
}
}
ivec4
operator
<
(
const
ivec4
&
other
)
const
{
ivec4
operator
<
(
ivec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcltq_s32
(
val
,
other
));
return
vreinterpretq_s32_u32
(
vcltq_s32
(
val
,
other
));
}
}
ivec4
operator
>=
(
const
ivec4
&
other
)
const
{
ivec4
operator
>=
(
ivec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcgeq_s32
(
val
,
other
));
return
vreinterpretq_s32_u32
(
vcgeq_s32
(
val
,
other
));
}
}
ivec4
operator
<=
(
const
ivec4
&
other
)
const
{
ivec4
operator
<=
(
ivec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcleq_s32
(
val
,
other
));
return
vreinterpretq_s32_u32
(
vcleq_s32
(
val
,
other
));
}
}
operator
fvec4
()
const
;
operator
fvec4
()
const
;
...
@@ -287,54 +270,84 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
...
@@ -287,54 +270,84 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
bitmask
&
4
?
-
1
:
0
,
bitmask
&
4
?
-
1
:
0
,
bitmask
&
8
?
-
1
:
0
);
bitmask
&
8
?
-
1
:
0
);
}
}
// Comparison operators
inline
ivec4
fvec4
::
operator
==
(
fvec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vceqq_f32
(
val
,
other
));
}
inline
ivec4
fvec4
::
operator
!=
(
fvec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vmvnq_u32
(
vceqq_f32
(
val
,
other
)));
// not(equals(val, other))
}
inline
ivec4
fvec4
::
operator
>
(
fvec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcgtq_f32
(
val
,
other
));
}
inline
ivec4
fvec4
::
operator
<
(
fvec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcltq_f32
(
val
,
other
));
}
inline
ivec4
fvec4
::
operator
>=
(
fvec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcgeq_f32
(
val
,
other
));
}
inline
ivec4
fvec4
::
operator
<=
(
fvec4
other
)
const
{
return
vreinterpretq_s32_u32
(
vcleq_f32
(
val
,
other
));
}
// Functions that operate on fvec4s.
// Functions that operate on fvec4s.
static
inline
fvec4
min
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
min
(
fvec4
v1
,
fvec4
v2
)
{
return
vminq_f32
(
v1
,
v2
);
return
vminq_f32
(
v1
,
v2
);
}
}
static
inline
fvec4
max
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
max
(
fvec4
v1
,
fvec4
v2
)
{
return
vmaxq_f32
(
v1
,
v2
);
return
vmaxq_f32
(
v1
,
v2
);
}
}
static
inline
fvec4
abs
(
const
fvec4
&
v
)
{
static
inline
fvec4
abs
(
fvec4
v
)
{
return
vabsq_f32
(
v
);
return
vabsq_f32
(
v
);
}
}
static
inline
fvec4
rsqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
rsqrt
(
fvec4
v
)
{
float32x4_t
recipSqrt
=
vrsqrteq_f32
(
v
);
float32x4_t
recipSqrt
=
vrsqrteq_f32
(
v
);
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
return
recipSqrt
;
return
recipSqrt
;
}
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
sqrt
(
fvec4
v
)
{
return
rsqrt
(
v
)
*
v
;
return
rsqrt
(
v
)
*
v
;
}
}
static
inline
fvec4
exp
(
const
fvec4
&
v
)
{
static
inline
fvec4
exp
(
fvec4
v
)
{
return
fvec4
(
exp_ps
(
v
.
val
));
return
fvec4
(
exp_ps
(
v
.
val
));
}
}
static
inline
fvec4
log
(
const
fvec4
&
v
)
{
static
inline
fvec4
log
(
fvec4
v
)
{
return
fvec4
(
log_ps
(
v
.
val
));
return
fvec4
(
log_ps
(
v
.
val
));
}
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot3
(
fvec4
v1
,
fvec4
v2
)
{
fvec4
result
=
v1
*
v2
;
fvec4
result
=
v1
*
v2
;
return
vgetq_lane_f32
(
result
,
0
)
+
vgetq_lane_f32
(
result
,
1
)
+
vgetq_lane_f32
(
result
,
2
);
return
vgetq_lane_f32
(
result
,
0
)
+
vgetq_lane_f32
(
result
,
1
)
+
vgetq_lane_f32
(
result
,
2
);
}
}
static
inline
float
dot4
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot4
(
fvec4
v1
,
fvec4
v2
)
{
fvec4
result
=
v1
*
v2
;
fvec4
result
=
v1
*
v2
;
return
vgetq_lane_f32
(
result
,
0
)
+
vgetq_lane_f32
(
result
,
1
)
+
vgetq_lane_f32
(
result
,
2
)
+
vgetq_lane_f32
(
result
,
3
);
return
vgetq_lane_f32
(
result
,
0
)
+
vgetq_lane_f32
(
result
,
1
)
+
vgetq_lane_f32
(
result
,
2
)
+
vgetq_lane_f32
(
result
,
3
);
}
}
static
inline
float
reduceAdd
(
const
fvec4
v
)
{
static
inline
float
reduceAdd
(
fvec4
v
)
{
#ifdef __ARM64__
return
vaddvq_f32
(
v
);
#else
return
dot4
(
v
,
fvec4
(
1.0
f
));
return
dot4
(
v
,
fvec4
(
1.0
f
));
#endif
}
}
static
inline
fvec4
cross
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
cross
(
fvec4
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
[
1
]
*
v2
[
2
]
-
v1
[
2
]
*
v2
[
1
],
return
fvec4
(
v1
[
1
]
*
v2
[
2
]
-
v1
[
2
]
*
v2
[
1
],
v1
[
2
]
*
v2
[
0
]
-
v1
[
0
]
*
v2
[
2
],
v1
[
2
]
*
v2
[
0
]
-
v1
[
0
]
*
v2
[
2
],
v1
[
0
]
*
v2
[
1
]
-
v1
[
1
]
*
v2
[
0
],
0
);
v1
[
0
]
*
v2
[
1
]
-
v1
[
1
]
*
v2
[
0
],
0
);
...
@@ -362,71 +375,79 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
...
@@ -362,71 +375,79 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/**
/**
* Out-of-place transpose from named variables into an array.
* Out-of-place transpose from named variables into an array.
*/
*/
static
inline
void
transpose
(
const
fvec4
v0
,
const
fvec4
v1
,
const
fvec4
v2
,
const
fvec4
v3
,
fvec4
out
[
4
])
{
static
inline
void
transpose
(
fvec4
v0
,
fvec4
v1
,
fvec4
v2
,
fvec4
v3
,
fvec4
out
[
4
])
{
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
}
}
// Functions that operate on ivec4s.
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
min
(
ivec4
v1
,
ivec4
v2
)
{
return
vminq_s32
(
v1
,
v2
);
return
vminq_s32
(
v1
,
v2
);
}
}
static
inline
ivec4
max
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
max
(
ivec4
v1
,
ivec4
v2
)
{
return
vmaxq_s32
(
v1
,
v2
);
return
vmaxq_s32
(
v1
,
v2
);
}
}
static
inline
ivec4
abs
(
const
ivec4
&
v
)
{
static
inline
ivec4
abs
(
ivec4
v
)
{
return
vabdq_s32
(
v
,
ivec4
(
0
));
return
vabdq_s32
(
v
,
ivec4
(
0
));
}
}
static
inline
bool
any
(
const
ivec4
&
v
)
{
static
inline
bool
any
(
ivec4
v
)
{
#ifdef __ARM64__
return
(
vmaxvq_u32
(
vreinterpretq_u32_s32
(
v
))
!=
0
);
#else
return
(
vgetq_lane_s32
(
v
,
0
)
!=
0
||
vgetq_lane_s32
(
v
,
1
)
!=
0
||
vgetq_lane_s32
(
v
,
2
)
!=
0
||
vgetq_lane_s32
(
v
,
3
)
!=
0
);
return
(
vgetq_lane_s32
(
v
,
0
)
!=
0
||
vgetq_lane_s32
(
v
,
1
)
!=
0
||
vgetq_lane_s32
(
v
,
2
)
!=
0
||
vgetq_lane_s32
(
v
,
3
)
!=
0
);
#endif
}
}
// Mathematical operators involving a scalar and a vector.
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
+
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
+
v2
;
return
fvec4
(
v1
)
+
v2
;
}
}
static
inline
fvec4
operator
-
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
-
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
-
v2
;
return
fvec4
(
v1
)
-
v2
;
}
}
static
inline
fvec4
operator
*
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
*
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
*
v2
;
return
fvec4
(
v1
)
*
v2
;
}
}
static
inline
fvec4
operator
/
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
/
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
/
v2
;
return
fvec4
(
v1
)
/
v2
;
}
}
// Operations for blending fvec4s based on an ivec4.
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
const
fvec4
&
v1
,
const
fvec4
&
v2
,
const
ivec4
&
mask
)
{
static
inline
fvec4
blend
(
fvec4
v1
,
fvec4
v2
,
ivec4
mask
)
{
return
vbslq_f32
(
vreinterpretq_u32_s32
(
mask
),
v2
,
v1
);
return
vbslq_f32
(
vreinterpretq_u32_s32
(
mask
),
v2
,
v1
);
}
}
static
inline
fvec4
blendZero
(
const
fvec4
v
,
const
ivec4
mask
)
{
static
inline
fvec4
blendZero
(
fvec4
v
,
ivec4
mask
)
{
return
blend
(
0.0
f
,
v
,
mask
);
return
vreinterpretq_f32_s32
(
vandq_s32
(
vreinterpretq_s32_f32
(
v
),
mask
));
}
static
inline
ivec4
blendZero
(
ivec4
v
,
ivec4
mask
)
{
return
v
&
mask
;
}
}
// These are at the end since they involve other functions defined above.
// These are at the end since they involve other functions defined above.
static
inline
fvec4
round
(
const
fvec4
&
v
)
{
static
inline
fvec4
round
(
fvec4
v
)
{
fvec4
shift
(
0x1
.0
p23f
);
fvec4
shift
(
0x1
.0
p23f
);
fvec4
absResult
=
(
abs
(
v
)
+
shift
)
-
shift
;
fvec4
absResult
=
(
abs
(
v
)
+
shift
)
-
shift
;
return
blend
(
v
,
absResult
,
ivec4
(
0x7FFFFFFF
));
return
blend
(
v
,
absResult
,
ivec4
(
0x7FFFFFFF
));
}
}
static
inline
fvec4
floor
(
const
fvec4
&
v
)
{
static
inline
fvec4
floor
(
fvec4
v
)
{
fvec4
rounded
=
round
(
v
);
fvec4
rounded
=
round
(
v
);
return
rounded
+
blend
(
0.0
f
,
-
1.0
f
,
rounded
>
v
);
return
rounded
+
blend
(
0.0
f
,
-
1.0
f
,
rounded
>
v
);
}
}
static
inline
fvec4
ceil
(
const
fvec4
&
v
)
{
static
inline
fvec4
ceil
(
fvec4
v
)
{
fvec4
rounded
=
round
(
v
);
fvec4
rounded
=
round
(
v
);
return
rounded
+
blend
(
0.0
f
,
1.0
f
,
rounded
<
v
);
return
rounded
+
blend
(
0.0
f
,
1.0
f
,
rounded
<
v
);
}
}
...
@@ -435,7 +456,7 @@ static inline fvec4 ceil(const fvec4& v) {
...
@@ -435,7 +456,7 @@ static inline fvec4 ceil(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
* result vector contains the values from each respective index+1.
*/
*/
static
inline
void
gatherVecPair
(
const
float
*
table
,
const
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
static
inline
void
gatherVecPair
(
const
float
*
table
,
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t2
(
table
+
index
[
2
]);
fvec4
t2
(
table
+
index
[
2
]);
...
@@ -458,7 +479,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
...
@@ -458,7 +479,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
* output[3] = undefined
*/
*/
static
inline
fvec4
reduceToVec3
(
const
fvec4
x
,
const
fvec4
y
,
const
fvec4
z
)
{
static
inline
fvec4
reduceToVec3
(
fvec4
x
,
fvec4
y
,
fvec4
z
)
{
const
auto
nx
=
reduceAdd
(
x
);
const
auto
nx
=
reduceAdd
(
x
);
const
auto
ny
=
reduceAdd
(
y
);
const
auto
ny
=
reduceAdd
(
y
);
const
auto
nz
=
reduceAdd
(
z
);
const
auto
nz
=
reduceAdd
(
z
);
...
...
openmmapi/include/openmm/internal/vectorize_pnacl.h
View file @
d36e3414
...
@@ -95,45 +95,45 @@ public:
...
@@ -95,45 +95,45 @@ public:
v
[
1
]
=
val
[
1
];
v
[
1
]
=
val
[
1
];
v
[
2
]
=
val
[
2
];
v
[
2
]
=
val
[
2
];
}
}
fvec4
operator
+
(
const
fvec4
&
other
)
const
{
fvec4
operator
+
(
fvec4
other
)
const
{
return
val
+
other
;
return
val
+
other
;
}
}
fvec4
operator
-
(
const
fvec4
&
other
)
const
{
fvec4
operator
-
(
fvec4
other
)
const
{
return
val
-
other
;
return
val
-
other
;
}
}
fvec4
operator
*
(
const
fvec4
&
other
)
const
{
fvec4
operator
*
(
fvec4
other
)
const
{
return
val
*
other
;
return
val
*
other
;
}
}
fvec4
operator
/
(
const
fvec4
&
other
)
const
{
fvec4
operator
/
(
fvec4
other
)
const
{
return
val
/
other
;
return
val
/
other
;
}
}
void
operator
+=
(
const
fvec4
&
other
)
{
void
operator
+=
(
fvec4
other
)
{
val
=
val
+
other
;
val
=
val
+
other
;
}
}
void
operator
-=
(
const
fvec4
&
other
)
{
void
operator
-=
(
fvec4
other
)
{
val
=
val
-
other
;
val
=
val
-
other
;
}
}
void
operator
*=
(
const
fvec4
&
other
)
{
void
operator
*=
(
fvec4
other
)
{
val
=
val
*
other
;
val
=
val
*
other
;
}
}
void
operator
/=
(
const
fvec4
&
other
)
{
void
operator
/=
(
fvec4
other
)
{
val
=
val
/
other
;
val
=
val
/
other
;
}
}
fvec4
operator
-
()
const
{
fvec4
operator
-
()
const
{
return
-
val
;
return
-
val
;
}
}
fvec4
operator
&
(
const
fvec4
&
other
)
const
{
fvec4
operator
&
(
fvec4
other
)
const
{
return
(
fvec4
)
(((
__m128i
)
val
)
&
((
__m128i
)
other
.
val
));
return
(
fvec4
)
(((
__m128i
)
val
)
&
((
__m128i
)
other
.
val
));
}
}
fvec4
operator
|
(
const
fvec4
&
other
)
const
{
fvec4
operator
|
(
fvec4
other
)
const
{
return
(
fvec4
)
(((
__m128i
)
val
)
|
((
__m128i
)
other
.
val
));
return
(
fvec4
)
(((
__m128i
)
val
)
|
((
__m128i
)
other
.
val
));
}
}
ivec4
operator
==
(
const
fvec4
&
other
)
const
;
ivec4
operator
==
(
fvec4
other
)
const
;
ivec4
operator
!=
(
const
fvec4
&
other
)
const
;
ivec4
operator
!=
(
fvec4
other
)
const
;
ivec4
operator
>
(
const
fvec4
&
other
)
const
;
ivec4
operator
>
(
fvec4
other
)
const
;
ivec4
operator
<
(
const
fvec4
&
other
)
const
;
ivec4
operator
<
(
fvec4
other
)
const
;
ivec4
operator
>=
(
const
fvec4
&
other
)
const
;
ivec4
operator
>=
(
fvec4
other
)
const
;
ivec4
operator
<=
(
const
fvec4
&
other
)
const
;
ivec4
operator
<=
(
fvec4
other
)
const
;
operator
ivec4
()
const
;
operator
ivec4
()
const
;
/**
/**
...
@@ -171,49 +171,49 @@ public:
...
@@ -171,49 +171,49 @@ public:
void
store
(
int
*
v
)
const
{
void
store
(
int
*
v
)
const
{
*
((
__m128
*
)
v
)
=
val
;
*
((
__m128
*
)
v
)
=
val
;
}
}
ivec4
operator
+
(
const
ivec4
&
other
)
const
{
ivec4
operator
+
(
ivec4
other
)
const
{
return
val
+
other
;
return
val
+
other
;
}
}
ivec4
operator
-
(
const
ivec4
&
other
)
const
{
ivec4
operator
-
(
ivec4
other
)
const
{
return
val
-
other
;
return
val
-
other
;
}
}
ivec4
operator
*
(
const
ivec4
&
other
)
const
{
ivec4
operator
*
(
ivec4
other
)
const
{
return
val
*
other
;
return
val
*
other
;
}
}
void
operator
+=
(
const
ivec4
&
other
)
{
void
operator
+=
(
ivec4
other
)
{
val
=
val
+
other
;
val
=
val
+
other
;
}
}
void
operator
-=
(
const
ivec4
&
other
)
{
void
operator
-=
(
ivec4
other
)
{
val
=
val
-
other
;
val
=
val
-
other
;
}
}
void
operator
*=
(
const
ivec4
&
other
)
{
void
operator
*=
(
ivec4
other
)
{
val
=
val
*
other
;
val
=
val
*
other
;
}
}
ivec4
operator
-
()
const
{
ivec4
operator
-
()
const
{
return
-
val
;
return
-
val
;
}
}
ivec4
operator
&
(
const
ivec4
&
other
)
const
{
ivec4
operator
&
(
ivec4
other
)
const
{
return
val
&
other
.
val
;
return
val
&
other
.
val
;
}
}
ivec4
operator
|
(
const
ivec4
&
other
)
const
{
ivec4
operator
|
(
ivec4
other
)
const
{
return
val
|
other
.
val
;
return
val
|
other
.
val
;
}
}
ivec4
operator
==
(
const
ivec4
&
other
)
const
{
ivec4
operator
==
(
ivec4
other
)
const
{
return
(
val
==
other
.
val
);
return
(
val
==
other
.
val
);
}
}
ivec4
operator
!=
(
const
ivec4
&
other
)
const
{
ivec4
operator
!=
(
ivec4
other
)
const
{
return
(
val
!=
other
.
val
);
return
(
val
!=
other
.
val
);
}
}
ivec4
operator
>
(
const
ivec4
&
other
)
const
{
ivec4
operator
>
(
ivec4
other
)
const
{
return
(
val
>
other
.
val
);
return
(
val
>
other
.
val
);
}
}
ivec4
operator
<
(
const
ivec4
&
other
)
const
{
ivec4
operator
<
(
ivec4
other
)
const
{
return
(
val
<
other
.
val
);
return
(
val
<
other
.
val
);
}
}
ivec4
operator
>=
(
const
ivec4
&
other
)
const
{
ivec4
operator
>=
(
ivec4
other
)
const
{
return
(
val
>=
other
.
val
);
return
(
val
>=
other
.
val
);
}
}
ivec4
operator
<=
(
const
ivec4
&
other
)
const
{
ivec4
operator
<=
(
ivec4
other
)
const
{
return
(
val
<=
other
.
val
);
return
(
val
<=
other
.
val
);
}
}
operator
fvec4
()
const
;
operator
fvec4
()
const
;
...
@@ -221,27 +221,27 @@ public:
...
@@ -221,27 +221,27 @@ public:
// Conversion operators.
// Conversion operators.
inline
ivec4
fvec4
::
operator
==
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
==
(
fvec4
other
)
const
{
return
(
__m128i
)
(
val
==
other
.
val
);
return
(
__m128i
)
(
val
==
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
!=
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
!=
(
fvec4
other
)
const
{
return
(
__m128i
)
(
val
!=
other
.
val
);
return
(
__m128i
)
(
val
!=
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
>
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
>
(
fvec4
other
)
const
{
return
(
__m128i
)
(
val
>
other
.
val
);
return
(
__m128i
)
(
val
>
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
<
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
<
(
fvec4
other
)
const
{
return
(
__m128i
)
(
val
<
other
.
val
);
return
(
__m128i
)
(
val
<
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
>=
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
>=
(
fvec4
other
)
const
{
return
(
__m128i
)
(
val
>=
other
.
val
);
return
(
__m128i
)
(
val
>=
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
<=
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
<=
(
fvec4
other
)
const
{
return
(
__m128i
)
(
val
<=
other
.
val
);
return
(
__m128i
)
(
val
<=
other
.
val
);
}
}
...
@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
...
@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s.
// Functions that operate on fvec4s.
static
inline
fvec4
abs
(
const
fvec4
&
v
)
{
static
inline
fvec4
abs
(
fvec4
v
)
{
return
v
&
(
__m128
)
ivec4
(
0x7FFFFFFF
);
return
v
&
(
__m128
)
ivec4
(
0x7FFFFFFF
);
}
}
static
inline
fvec4
exp
(
const
fvec4
&
v
)
{
static
inline
fvec4
exp
(
fvec4
v
)
{
return
fvec4
(
expf
(
v
[
0
]),
expf
(
v
[
1
]),
expf
(
v
[
2
]),
expf
(
v
[
3
]));
return
fvec4
(
expf
(
v
[
0
]),
expf
(
v
[
1
]),
expf
(
v
[
2
]),
expf
(
v
[
3
]));
}
}
static
inline
fvec4
log
(
const
fvec4
&
v
)
{
static
inline
fvec4
log
(
fvec4
v
)
{
return
fvec4
(
logf
(
v
[
0
]),
logf
(
v
[
1
]),
logf
(
v
[
2
]),
logf
(
v
[
3
]));
return
fvec4
(
logf
(
v
[
0
]),
logf
(
v
[
1
]),
logf
(
v
[
2
]),
logf
(
v
[
3
]));
}
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot3
(
fvec4
v1
,
fvec4
v2
)
{
fvec4
r
=
v1
*
v2
;
fvec4
r
=
v1
*
v2
;
return
r
[
0
]
+
r
[
1
]
+
r
[
2
];
return
r
[
0
]
+
r
[
1
]
+
r
[
2
];
}
}
static
inline
float
dot4
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot4
(
fvec4
v1
,
fvec4
v2
)
{
fvec4
r
=
v1
*
v2
;
fvec4
r
=
v1
*
v2
;
fvec4
temp
=
__builtin_shufflevector
(
r
.
val
,
r
.
val
,
0
,
1
,
-
1
,
-
1
)
+
__builtin_shufflevector
(
r
.
val
,
r
.
val
,
2
,
3
,
-
1
,
-
1
);
fvec4
temp
=
__builtin_shufflevector
(
r
.
val
,
r
.
val
,
0
,
1
,
-
1
,
-
1
)
+
__builtin_shufflevector
(
r
.
val
,
r
.
val
,
2
,
3
,
-
1
,
-
1
);
return
temp
[
0
]
+
temp
[
1
];
return
temp
[
0
]
+
temp
[
1
];
}
}
static
inline
float
reduceAdd
(
const
fvec4
v
)
{
static
inline
float
reduceAdd
(
fvec4
v
)
{
return
dot4
(
v
,
fvec4
(
1.0
f
));
return
dot4
(
v
,
fvec4
(
1.0
f
));
}
}
static
inline
fvec4
cross
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
cross
(
fvec4
v1
,
fvec4
v2
)
{
__m128
temp
=
v2
.
val
*
__builtin_shufflevector
(
v1
.
val
,
v1
.
val
,
2
,
0
,
1
,
3
)
-
__m128
temp
=
v2
.
val
*
__builtin_shufflevector
(
v1
.
val
,
v1
.
val
,
2
,
0
,
1
,
3
)
-
v1
.
val
*
__builtin_shufflevector
(
v2
.
val
,
v2
.
val
,
2
,
0
,
1
,
3
);
v1
.
val
*
__builtin_shufflevector
(
v2
.
val
,
v2
.
val
,
2
,
0
,
1
,
3
);
return
__builtin_shufflevector
(
temp
,
temp
,
2
,
0
,
1
,
3
);
return
__builtin_shufflevector
(
temp
,
temp
,
2
,
0
,
1
,
3
);
...
@@ -317,85 +317,89 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
...
@@ -317,85 +317,89 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/**
/**
* Out-of-place transpose from named variables into an array.
* Out-of-place transpose from named variables into an array.
*/
*/
static
inline
void
transpose
(
const
fvec4
v0
,
const
fvec4
v1
,
const
fvec4
v2
,
const
fvec4
v3
,
fvec4
out
[
4
])
{
static
inline
void
transpose
(
fvec4
v0
,
fvec4
v1
,
fvec4
v2
,
fvec4
v3
,
fvec4
out
[
4
])
{
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
}
}
// Functions that operate on ivec4s.
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
min
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
std
::
min
(
v1
[
0
],
v2
[
0
]),
std
::
min
(
v1
[
1
],
v2
[
1
]),
std
::
min
(
v1
[
2
],
v2
[
2
]),
std
::
min
(
v1
[
3
],
v2
[
3
]));
return
ivec4
(
std
::
min
(
v1
[
0
],
v2
[
0
]),
std
::
min
(
v1
[
1
],
v2
[
1
]),
std
::
min
(
v1
[
2
],
v2
[
2
]),
std
::
min
(
v1
[
3
],
v2
[
3
]));
}
}
static
inline
ivec4
max
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
max
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
std
::
max
(
v1
[
0
],
v2
[
0
]),
std
::
max
(
v1
[
1
],
v2
[
1
]),
std
::
max
(
v1
[
2
],
v2
[
2
]),
std
::
max
(
v1
[
3
],
v2
[
3
]));
return
ivec4
(
std
::
max
(
v1
[
0
],
v2
[
0
]),
std
::
max
(
v1
[
1
],
v2
[
1
]),
std
::
max
(
v1
[
2
],
v2
[
2
]),
std
::
max
(
v1
[
3
],
v2
[
3
]));
}
}
static
inline
ivec4
abs
(
const
ivec4
&
v
)
{
static
inline
ivec4
abs
(
ivec4
v
)
{
return
ivec4
(
abs
(
v
[
0
]),
abs
(
v
[
1
]),
abs
(
v
[
2
]),
abs
(
v
[
3
]));
return
ivec4
(
abs
(
v
[
0
]),
abs
(
v
[
1
]),
abs
(
v
[
2
]),
abs
(
v
[
3
]));
}
}
static
inline
bool
any
(
const
__m128i
&
v
)
{
static
inline
bool
any
(
__m128i
v
)
{
ivec4
temp
=
__builtin_shufflevector
(
v
,
v
,
0
,
1
,
-
1
,
-
1
)
|
__builtin_shufflevector
(
v
,
v
,
2
,
3
,
-
1
,
-
1
);
ivec4
temp
=
__builtin_shufflevector
(
v
,
v
,
0
,
1
,
-
1
,
-
1
)
|
__builtin_shufflevector
(
v
,
v
,
2
,
3
,
-
1
,
-
1
);
return
(
temp
[
0
]
||
temp
[
1
]);
return
(
temp
[
0
]
||
temp
[
1
]);
}
}
// Mathematical operators involving a scalar and a vector.
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
+
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
+
v2
;
return
fvec4
(
v1
)
+
v2
;
}
}
static
inline
fvec4
operator
-
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
-
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
-
v2
;
return
fvec4
(
v1
)
-
v2
;
}
}
static
inline
fvec4
operator
*
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
*
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
*
v2
;
return
fvec4
(
v1
)
*
v2
;
}
}
static
inline
fvec4
operator
/
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
/
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
/
v2
;
return
fvec4
(
v1
)
/
v2
;
}
}
// Operations for blending fvec4s based on an ivec4.
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
const
fvec4
&
v1
,
const
fvec4
&
v2
,
const
__m128i
&
mask
)
{
static
inline
fvec4
blend
(
fvec4
v1
,
fvec4
v2
,
__m128i
mask
)
{
return
(
__m128
)
((
mask
&
(
__m128i
)
v2
)
+
((
ivec4
(
0xFFFFFFFF
)
-
ivec4
(
mask
))
&
(
__m128i
)
v1
));
return
(
__m128
)
((
mask
&
(
__m128i
)
v2
)
+
((
ivec4
(
0xFFFFFFFF
)
-
ivec4
(
mask
))
&
(
__m128i
)
v1
));
}
}
static
inline
fvec4
blendZero
(
const
fvec4
v
,
const
ivec4
mask
)
{
static
inline
fvec4
blendZero
(
fvec4
v
,
ivec4
mask
)
{
return
blend
(
0.0
f
,
v
,
mask
);
return
blend
(
0.0
f
,
v
,
mask
);
}
}
static
inline
ivec4
blendZero
(
ivec4
v
,
ivec4
mask
)
{
return
v
&
mask
;
}
// These are at the end since they involve other functions defined above.
// These are at the end since they involve other functions defined above.
static
inline
fvec4
min
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
min
(
fvec4
v1
,
fvec4
v2
)
{
return
blend
(
v1
,
v2
,
v1
>
v2
);
return
blend
(
v1
,
v2
,
v1
>
v2
);
}
}
static
inline
fvec4
max
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
max
(
fvec4
v1
,
fvec4
v2
)
{
return
blend
(
v1
,
v2
,
v1
<
v2
);
return
blend
(
v1
,
v2
,
v1
<
v2
);
}
}
static
inline
fvec4
round
(
const
fvec4
&
v
)
{
static
inline
fvec4
round
(
fvec4
v
)
{
fvec4
shift
(
0x1
.0
p23f
);
fvec4
shift
(
0x1
.0
p23f
);
fvec4
absResult
=
(
abs
(
v
)
+
shift
)
-
shift
;
fvec4
absResult
=
(
abs
(
v
)
+
shift
)
-
shift
;
return
(
__m128
)
((
ivec4
(
0x80000000
)
&
(
__m128i
)
v
)
+
(
ivec4
(
0x7FFFFFFF
)
&
(
__m128i
)
absResult
));
return
(
__m128
)
((
ivec4
(
0x80000000
)
&
(
__m128i
)
v
)
+
(
ivec4
(
0x7FFFFFFF
)
&
(
__m128i
)
absResult
));
}
}
static
inline
fvec4
floor
(
const
fvec4
&
v
)
{
static
inline
fvec4
floor
(
fvec4
v
)
{
fvec4
truncated
=
__builtin_convertvector
(
__builtin_convertvector
(
v
.
val
,
__m128i
),
__m128
);
fvec4
truncated
=
__builtin_convertvector
(
__builtin_convertvector
(
v
.
val
,
__m128i
),
__m128
);
return
truncated
+
blend
(
0.0
f
,
-
1.0
f
,
truncated
>
v
);
return
truncated
+
blend
(
0.0
f
,
-
1.0
f
,
truncated
>
v
);
}
}
static
inline
fvec4
ceil
(
const
fvec4
&
v
)
{
static
inline
fvec4
ceil
(
fvec4
v
)
{
fvec4
truncated
=
__builtin_convertvector
(
__builtin_convertvector
(
v
.
val
,
__m128i
),
__m128
);
fvec4
truncated
=
__builtin_convertvector
(
__builtin_convertvector
(
v
.
val
,
__m128i
),
__m128
);
return
truncated
+
blend
(
0.0
f
,
1.0
f
,
truncated
<
v
);
return
truncated
+
blend
(
0.0
f
,
1.0
f
,
truncated
<
v
);
}
}
static
inline
fvec4
rsqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
rsqrt
(
fvec4
v
)
{
// Initial estimate of rsqrt().
// Initial estimate of rsqrt().
ivec4
i
=
(
__m128i
)
v
;
ivec4
i
=
(
__m128i
)
v
;
...
@@ -411,7 +415,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
...
@@ -411,7 +415,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
return
y
;
return
y
;
}
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
sqrt
(
fvec4
v
)
{
return
rsqrt
(
v
)
*
v
;
return
rsqrt
(
v
)
*
v
;
}
}
...
@@ -420,7 +424,7 @@ static inline fvec4 sqrt(const fvec4& v) {
...
@@ -420,7 +424,7 @@ static inline fvec4 sqrt(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
* result vector contains the values from each respective index+1.
*/
*/
static
inline
void
gatherVecPair
(
const
float
*
table
,
const
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
static
inline
void
gatherVecPair
(
const
float
*
table
,
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t2
(
table
+
index
[
2
]);
fvec4
t2
(
table
+
index
[
2
]);
...
@@ -443,7 +447,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
...
@@ -443,7 +447,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
* output[3] = undefined
*/
*/
static
inline
fvec4
reduceToVec3
(
const
fvec4
x
,
const
fvec4
y
,
const
fvec4
z
)
{
static
inline
fvec4
reduceToVec3
(
fvec4
x
,
fvec4
y
,
fvec4
z
)
{
const
auto
nx
=
reduceAdd
(
x
);
const
auto
nx
=
reduceAdd
(
x
);
const
auto
ny
=
reduceAdd
(
y
);
const
auto
ny
=
reduceAdd
(
y
);
const
auto
nz
=
reduceAdd
(
z
);
const
auto
nz
=
reduceAdd
(
z
);
...
...
openmmapi/include/openmm/internal/vectorize_ppc.h
View file @
d36e3414
...
@@ -97,45 +97,45 @@ public:
...
@@ -97,45 +97,45 @@ public:
v
[
2
]
=
val
[
2
];
v
[
2
]
=
val
[
2
];
}
}
fvec4
operator
+
(
const
fvec4
&
other
)
const
{
fvec4
operator
+
(
fvec4
other
)
const
{
return
vec_add
(
val
,
other
.
val
);
return
vec_add
(
val
,
other
.
val
);
}
}
fvec4
operator
-
(
const
fvec4
&
other
)
const
{
fvec4
operator
-
(
fvec4
other
)
const
{
return
vec_sub
(
val
,
other
.
val
);
return
vec_sub
(
val
,
other
.
val
);
}
}
fvec4
operator
*
(
const
fvec4
&
other
)
const
{
fvec4
operator
*
(
fvec4
other
)
const
{
return
vec_mul
(
val
,
other
.
val
);
return
vec_mul
(
val
,
other
.
val
);
}
}
fvec4
operator
/
(
const
fvec4
&
other
)
const
{
fvec4
operator
/
(
fvec4
other
)
const
{
return
vec_div
(
val
,
other
.
val
);
return
vec_div
(
val
,
other
.
val
);
}
}
void
operator
+=
(
const
fvec4
&
other
)
{
void
operator
+=
(
fvec4
other
)
{
val
=
vec_add
(
val
,
other
.
val
);
val
=
vec_add
(
val
,
other
.
val
);
}
}
void
operator
-=
(
const
fvec4
&
other
)
{
void
operator
-=
(
fvec4
other
)
{
val
=
vec_sub
(
val
,
other
.
val
);
val
=
vec_sub
(
val
,
other
.
val
);
}
}
void
operator
*=
(
const
fvec4
&
other
)
{
void
operator
*=
(
fvec4
other
)
{
val
=
vec_mul
(
val
,
other
.
val
);
val
=
vec_mul
(
val
,
other
.
val
);
}
}
void
operator
/=
(
const
fvec4
&
other
)
{
void
operator
/=
(
fvec4
other
)
{
val
=
vec_div
(
val
,
other
.
val
);
val
=
vec_div
(
val
,
other
.
val
);
}
}
fvec4
operator
-
()
const
{
fvec4
operator
-
()
const
{
return
-
val
;
return
-
val
;
}
}
fvec4
operator
&
(
const
fvec4
&
other
)
const
{
fvec4
operator
&
(
fvec4
other
)
const
{
return
vec_and
(
val
,
other
.
val
);
return
vec_and
(
val
,
other
.
val
);
}
}
fvec4
operator
|
(
const
fvec4
&
other
)
const
{
fvec4
operator
|
(
fvec4
other
)
const
{
return
vec_or
(
val
,
other
.
val
);
return
vec_or
(
val
,
other
.
val
);
}
}
ivec4
operator
==
(
const
fvec4
&
other
)
const
;
ivec4
operator
==
(
fvec4
other
)
const
;
ivec4
operator
!=
(
const
fvec4
&
other
)
const
;
ivec4
operator
!=
(
fvec4
other
)
const
;
ivec4
operator
>
(
const
fvec4
&
other
)
const
;
ivec4
operator
>
(
fvec4
other
)
const
;
ivec4
operator
<
(
const
fvec4
&
other
)
const
;
ivec4
operator
<
(
fvec4
other
)
const
;
ivec4
operator
>=
(
const
fvec4
&
other
)
const
;
ivec4
operator
>=
(
fvec4
other
)
const
;
ivec4
operator
<=
(
const
fvec4
&
other
)
const
;
ivec4
operator
<=
(
fvec4
other
)
const
;
operator
ivec4
()
const
;
operator
ivec4
()
const
;
/***
/***
...
@@ -173,49 +173,49 @@ public:
...
@@ -173,49 +173,49 @@ public:
void
store
(
int
*
v
)
const
{
void
store
(
int
*
v
)
const
{
*
((
__m128i
*
)
v
)
=
val
;
*
((
__m128i
*
)
v
)
=
val
;
}
}
ivec4
operator
+
(
const
ivec4
&
other
)
const
{
ivec4
operator
+
(
ivec4
other
)
const
{
return
vec_add
(
val
,
other
.
val
);
return
vec_add
(
val
,
other
.
val
);
}
}
ivec4
operator
-
(
const
ivec4
&
other
)
const
{
ivec4
operator
-
(
ivec4
other
)
const
{
return
vec_sub
(
val
,
other
.
val
);
return
vec_sub
(
val
,
other
.
val
);
}
}
ivec4
operator
*
(
const
ivec4
&
other
)
const
{
ivec4
operator
*
(
ivec4
other
)
const
{
return
val
*
other
.
val
;
return
val
*
other
.
val
;
}
}
void
operator
+=
(
const
ivec4
&
other
)
{
void
operator
+=
(
ivec4
other
)
{
val
=
vec_add
(
val
,
other
.
val
);
val
=
vec_add
(
val
,
other
.
val
);
}
}
void
operator
-=
(
const
ivec4
&
other
)
{
void
operator
-=
(
ivec4
other
)
{
val
=
vec_sub
(
val
,
other
.
val
);
val
=
vec_sub
(
val
,
other
.
val
);
}
}
void
operator
*=
(
const
ivec4
&
other
)
{
void
operator
*=
(
ivec4
other
)
{
val
=
val
*
other
.
val
;
val
=
val
*
other
.
val
;
}
}
ivec4
operator
-
()
const
{
ivec4
operator
-
()
const
{
return
-
val
;
return
-
val
;
}
}
ivec4
operator
&
(
const
ivec4
&
other
)
const
{
ivec4
operator
&
(
ivec4
other
)
const
{
return
val
&
other
.
val
;
return
val
&
other
.
val
;
}
}
ivec4
operator
|
(
const
ivec4
&
other
)
const
{
ivec4
operator
|
(
ivec4
other
)
const
{
return
val
|
other
.
val
;
return
val
|
other
.
val
;
}
}
ivec4
operator
==
(
const
ivec4
&
other
)
const
{
ivec4
operator
==
(
ivec4
other
)
const
{
return
(
val
==
other
.
val
);
return
(
val
==
other
.
val
);
}
}
ivec4
operator
!=
(
const
ivec4
&
other
)
const
{
ivec4
operator
!=
(
ivec4
other
)
const
{
return
(
val
!=
other
.
val
);
return
(
val
!=
other
.
val
);
}
}
ivec4
operator
>
(
const
ivec4
&
other
)
const
{
ivec4
operator
>
(
ivec4
other
)
const
{
return
(
val
>
other
.
val
);
return
(
val
>
other
.
val
);
}
}
ivec4
operator
<
(
const
ivec4
&
other
)
const
{
ivec4
operator
<
(
ivec4
other
)
const
{
return
(
val
<
other
.
val
);
return
(
val
<
other
.
val
);
}
}
ivec4
operator
>=
(
const
ivec4
&
other
)
const
{
ivec4
operator
>=
(
ivec4
other
)
const
{
return
(
val
>=
other
.
val
);
return
(
val
>=
other
.
val
);
}
}
ivec4
operator
<=
(
const
ivec4
&
other
)
const
{
ivec4
operator
<=
(
ivec4
other
)
const
{
return
(
val
<=
other
.
val
);
return
(
val
<=
other
.
val
);
}
}
operator
fvec4
()
const
;
operator
fvec4
()
const
;
...
@@ -223,27 +223,27 @@ public:
...
@@ -223,27 +223,27 @@ public:
// Conversion operators.
// Conversion operators.
inline
ivec4
fvec4
::
operator
==
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
==
(
fvec4
other
)
const
{
return
(
val
==
other
.
val
);
return
(
val
==
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
!=
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
!=
(
fvec4
other
)
const
{
return
(
val
!=
other
.
val
);
return
(
val
!=
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
>
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
>
(
fvec4
other
)
const
{
return
(
val
>
other
.
val
);
return
(
val
>
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
<
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
<
(
fvec4
other
)
const
{
return
(
val
<
other
.
val
);
return
(
val
<
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
>=
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
>=
(
fvec4
other
)
const
{
return
(
val
>=
other
.
val
);
return
(
val
>=
other
.
val
);
}
}
inline
ivec4
fvec4
::
operator
<=
(
const
fvec4
&
other
)
const
{
inline
ivec4
fvec4
::
operator
<=
(
fvec4
other
)
const
{
return
(
val
<=
other
.
val
);
return
(
val
<=
other
.
val
);
}
}
...
@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
...
@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s.
// Functions that operate on fvec4s.
static
inline
fvec4
abs
(
const
fvec4
&
v
)
{
static
inline
fvec4
abs
(
fvec4
v
)
{
return
vec_abs
(
v
.
val
);
return
vec_abs
(
v
.
val
);
}
}
static
inline
fvec4
exp
(
const
fvec4
&
v
)
{
static
inline
fvec4
exp
(
fvec4
v
)
{
return
fvec4
(
expf
(
v
[
0
]),
expf
(
v
[
1
]),
expf
(
v
[
2
]),
expf
(
v
[
3
]));
return
fvec4
(
expf
(
v
[
0
]),
expf
(
v
[
1
]),
expf
(
v
[
2
]),
expf
(
v
[
3
]));
}
}
static
inline
fvec4
log
(
const
fvec4
&
v
)
{
static
inline
fvec4
log
(
fvec4
v
)
{
return
fvec4
(
logf
(
v
[
0
]),
logf
(
v
[
1
]),
logf
(
v
[
2
]),
logf
(
v
[
3
]));
return
fvec4
(
logf
(
v
[
0
]),
logf
(
v
[
1
]),
logf
(
v
[
2
]),
logf
(
v
[
3
]));
}
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot3
(
fvec4
v1
,
fvec4
v2
)
{
fvec4
r
=
v1
*
v2
;
fvec4
r
=
v1
*
v2
;
return
r
[
0
]
+
r
[
1
]
+
r
[
2
];
return
r
[
0
]
+
r
[
1
]
+
r
[
2
];
}
}
static
inline
float
dot4
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot4
(
fvec4
v1
,
fvec4
v2
)
{
fvec4
r
=
v1
*
v2
;
fvec4
r
=
v1
*
v2
;
fvec4
temp
=
r
+
vec_sld
(
r
.
val
,
r
.
val
,
8
);
fvec4
temp
=
r
+
vec_sld
(
r
.
val
,
r
.
val
,
8
);
return
temp
[
0
]
+
temp
[
1
];
return
temp
[
0
]
+
temp
[
1
];
}
}
static
inline
float
reduceAdd
(
const
fvec4
v
)
{
static
inline
float
reduceAdd
(
fvec4
v
)
{
return
dot4
(
v
,
fvec4
(
1.0
f
));
return
dot4
(
v
,
fvec4
(
1.0
f
));
}
}
static
inline
fvec4
cross
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
cross
(
fvec4
v1
,
fvec4
v2
)
{
vector
unsigned
char
perm
=
(
vector
unsigned
char
)
{
8
,
9
,
10
,
11
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
12
,
13
,
14
,
15
};
vector
unsigned
char
perm
=
(
vector
unsigned
char
)
{
8
,
9
,
10
,
11
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
12
,
13
,
14
,
15
};
__m128
temp
=
v2
.
val
*
vec_perm
(
v1
.
val
,
v1
.
val
,
perm
)
-
__m128
temp
=
v2
.
val
*
vec_perm
(
v1
.
val
,
v1
.
val
,
perm
)
-
v1
.
val
*
vec_perm
(
v2
.
val
,
v2
.
val
,
perm
);
v1
.
val
*
vec_perm
(
v2
.
val
,
v2
.
val
,
perm
);
...
@@ -324,80 +324,84 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
...
@@ -324,80 +324,84 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/**
/**
* Out-of-place transpose from named variables into an array.
* Out-of-place transpose from named variables into an array.
*/
*/
static
inline
void
transpose
(
const
fvec4
v0
,
const
fvec4
v1
,
const
fvec4
v2
,
const
fvec4
v3
,
fvec4
out
[
4
])
{
static
inline
void
transpose
(
fvec4
v0
,
fvec4
v1
,
fvec4
v2
,
fvec4
v3
,
fvec4
out
[
4
])
{
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
}
}
// Functions that operate on ivec4s.
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
min
(
ivec4
v1
,
ivec4
v2
)
{
return
vec_min
(
v1
.
val
,
v2
.
val
);
return
vec_min
(
v1
.
val
,
v2
.
val
);
}
}
static
inline
ivec4
max
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
max
(
ivec4
v1
,
ivec4
v2
)
{
return
vec_max
(
v1
.
val
,
v2
.
val
);
return
vec_max
(
v1
.
val
,
v2
.
val
);
}
}
static
inline
ivec4
abs
(
const
ivec4
&
v
)
{
static
inline
ivec4
abs
(
ivec4
v
)
{
return
vec_abs
(
v
.
val
);
return
vec_abs
(
v
.
val
);
}
}
static
inline
bool
any
(
const
ivec4
v
)
{
static
inline
bool
any
(
ivec4
v
)
{
return
!
vec_all_eq
(
v
.
val
,
ivec4
(
0
).
val
);
return
!
vec_all_eq
(
v
.
val
,
ivec4
(
0
).
val
);
}
}
// Mathematical operators involving a scalar and a vector.
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
+
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
+
v2
;
return
fvec4
(
v1
)
+
v2
;
}
}
static
inline
fvec4
operator
-
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
-
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
-
v2
;
return
fvec4
(
v1
)
-
v2
;
}
}
static
inline
fvec4
operator
*
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
*
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
*
v2
;
return
fvec4
(
v1
)
*
v2
;
}
}
static
inline
fvec4
operator
/
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
/
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
/
v2
;
return
fvec4
(
v1
)
/
v2
;
}
}
// Operations for blending fvec4s based on an ivec4.
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
const
fvec4
&
v1
,
const
fvec4
&
v2
,
const
__m128i
&
mask
)
{
static
inline
fvec4
blend
(
fvec4
v1
,
fvec4
v2
,
__m128i
mask
)
{
return
(
__m128
)
((
mask
&
(
__m128i
)
v2
.
val
)
+
((
ivec4
(
0xFFFFFFFF
)
-
ivec4
(
mask
))
&
(
__m128i
)
v1
.
val
).
val
);
return
(
__m128
)
((
mask
&
(
__m128i
)
v2
.
val
)
+
((
ivec4
(
0xFFFFFFFF
)
-
ivec4
(
mask
))
&
(
__m128i
)
v1
.
val
).
val
);
}
}
static
inline
fvec4
blendZero
(
const
fvec4
v
,
const
ivec4
mask
)
{
static
inline
fvec4
blendZero
(
fvec4
v
,
ivec4
mask
)
{
return
blend
(
0.0
f
,
v
,
mask
);
return
blend
(
0.0
f
,
v
,
mask
);
}
}
static
inline
ivec4
blendZero
(
ivec4
v
,
ivec4
mask
)
{
return
v
&
mask
;
}
// These are at the end since they involve other functions defined above.
// These are at the end since they involve other functions defined above.
static
inline
fvec4
min
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
min
(
fvec4
v1
,
fvec4
v2
)
{
return
vec_min
(
v1
.
val
,
v2
.
val
);
return
vec_min
(
v1
.
val
,
v2
.
val
);
}
}
static
inline
fvec4
max
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
max
(
fvec4
v1
,
fvec4
v2
)
{
return
vec_max
(
v1
.
val
,
v2
.
val
);
return
vec_max
(
v1
.
val
,
v2
.
val
);
}
}
static
inline
fvec4
round
(
const
fvec4
&
v
)
{
static
inline
fvec4
round
(
fvec4
v
)
{
return
vec_round
(
v
.
val
);
return
vec_round
(
v
.
val
);
}
}
static
inline
fvec4
floor
(
const
fvec4
&
v
)
{
static
inline
fvec4
floor
(
fvec4
v
)
{
return
vec_floor
(
v
.
val
);
return
vec_floor
(
v
.
val
);
}
}
static
inline
fvec4
ceil
(
const
fvec4
&
v
)
{
static
inline
fvec4
ceil
(
fvec4
v
)
{
return
vec_ceil
(
v
.
val
);
return
vec_ceil
(
v
.
val
);
}
}
static
inline
fvec4
rsqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
rsqrt
(
fvec4
v
)
{
// Initial estimate of rsqrt().
// Initial estimate of rsqrt().
fvec4
y
(
vec_rsqrte
(
v
.
val
));
fvec4
y
(
vec_rsqrte
(
v
.
val
));
...
@@ -409,7 +413,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
...
@@ -409,7 +413,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
return
y
;
return
y
;
}
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
sqrt
(
fvec4
v
)
{
return
vec_sqrt
(
v
.
val
);
return
vec_sqrt
(
v
.
val
);
}
}
...
@@ -417,7 +421,7 @@ static inline fvec4 sqrt(const fvec4& v) {
...
@@ -417,7 +421,7 @@ static inline fvec4 sqrt(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
* result vector contains the values from each respective index+1.
*/
*/
static
inline
void
gatherVecPair
(
const
float
*
table
,
const
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
static
inline
void
gatherVecPair
(
const
float
*
table
,
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t2
(
table
+
index
[
2
]);
fvec4
t2
(
table
+
index
[
2
]);
...
@@ -440,7 +444,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
...
@@ -440,7 +444,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
* output[3] = undefined
*/
*/
static
inline
fvec4
reduceToVec3
(
const
fvec4
x
,
const
fvec4
y
,
const
fvec4
z
)
{
static
inline
fvec4
reduceToVec3
(
fvec4
x
,
fvec4
y
,
fvec4
z
)
{
const
auto
nx
=
reduceAdd
(
x
);
const
auto
nx
=
reduceAdd
(
x
);
const
auto
ny
=
reduceAdd
(
y
);
const
auto
ny
=
reduceAdd
(
y
);
const
auto
nz
=
reduceAdd
(
z
);
const
auto
nz
=
reduceAdd
(
z
);
...
...
openmmapi/include/openmm/internal/vectorize_sse.h
View file @
d36e3414
...
@@ -108,55 +108,55 @@ public:
...
@@ -108,55 +108,55 @@ public:
#endif
#endif
}
}
fvec4
operator
+
(
const
fvec4
&
other
)
const
{
fvec4
operator
+
(
fvec4
other
)
const
{
return
_mm_add_ps
(
val
,
other
);
return
_mm_add_ps
(
val
,
other
);
}
}
fvec4
operator
-
(
const
fvec4
&
other
)
const
{
fvec4
operator
-
(
fvec4
other
)
const
{
return
_mm_sub_ps
(
val
,
other
);
return
_mm_sub_ps
(
val
,
other
);
}
}
fvec4
operator
*
(
const
fvec4
&
other
)
const
{
fvec4
operator
*
(
fvec4
other
)
const
{
return
_mm_mul_ps
(
val
,
other
);
return
_mm_mul_ps
(
val
,
other
);
}
}
fvec4
operator
/
(
const
fvec4
&
other
)
const
{
fvec4
operator
/
(
fvec4
other
)
const
{
return
_mm_div_ps
(
val
,
other
);
return
_mm_div_ps
(
val
,
other
);
}
}
void
operator
+=
(
const
fvec4
&
other
)
{
void
operator
+=
(
fvec4
other
)
{
val
=
_mm_add_ps
(
val
,
other
);
val
=
_mm_add_ps
(
val
,
other
);
}
}
void
operator
-=
(
const
fvec4
&
other
)
{
void
operator
-=
(
fvec4
other
)
{
val
=
_mm_sub_ps
(
val
,
other
);
val
=
_mm_sub_ps
(
val
,
other
);
}
}
void
operator
*=
(
const
fvec4
&
other
)
{
void
operator
*=
(
fvec4
other
)
{
val
=
_mm_mul_ps
(
val
,
other
);
val
=
_mm_mul_ps
(
val
,
other
);
}
}
void
operator
/=
(
const
fvec4
&
other
)
{
void
operator
/=
(
fvec4
other
)
{
val
=
_mm_div_ps
(
val
,
other
);
val
=
_mm_div_ps
(
val
,
other
);
}
}
fvec4
operator
-
()
const
{
fvec4
operator
-
()
const
{
return
_mm_sub_ps
(
_mm_set1_ps
(
0.0
f
),
val
);
return
_mm_sub_ps
(
_mm_set1_ps
(
0.0
f
),
val
);
}
}
fvec4
operator
&
(
const
fvec4
&
other
)
const
{
fvec4
operator
&
(
fvec4
other
)
const
{
return
_mm_and_ps
(
val
,
other
);
return
_mm_and_ps
(
val
,
other
);
}
}
fvec4
operator
|
(
const
fvec4
&
other
)
const
{
fvec4
operator
|
(
fvec4
other
)
const
{
return
_mm_or_ps
(
val
,
other
);
return
_mm_or_ps
(
val
,
other
);
}
}
fvec4
operator
==
(
const
fvec4
&
other
)
const
{
fvec4
operator
==
(
fvec4
other
)
const
{
return
_mm_cmpeq_ps
(
val
,
other
);
return
_mm_cmpeq_ps
(
val
,
other
);
}
}
fvec4
operator
!=
(
const
fvec4
&
other
)
const
{
fvec4
operator
!=
(
fvec4
other
)
const
{
return
_mm_cmpneq_ps
(
val
,
other
);
return
_mm_cmpneq_ps
(
val
,
other
);
}
}
fvec4
operator
>
(
const
fvec4
&
other
)
const
{
fvec4
operator
>
(
fvec4
other
)
const
{
return
_mm_cmpgt_ps
(
val
,
other
);
return
_mm_cmpgt_ps
(
val
,
other
);
}
}
fvec4
operator
<
(
const
fvec4
&
other
)
const
{
fvec4
operator
<
(
fvec4
other
)
const
{
return
_mm_cmplt_ps
(
val
,
other
);
return
_mm_cmplt_ps
(
val
,
other
);
}
}
fvec4
operator
>=
(
const
fvec4
&
other
)
const
{
fvec4
operator
>=
(
fvec4
other
)
const
{
return
_mm_cmpge_ps
(
val
,
other
);
return
_mm_cmpge_ps
(
val
,
other
);
}
}
fvec4
operator
<=
(
const
fvec4
&
other
)
const
{
fvec4
operator
<=
(
fvec4
other
)
const
{
return
_mm_cmple_ps
(
val
,
other
);
return
_mm_cmple_ps
(
val
,
other
);
}
}
operator
ivec4
()
const
;
operator
ivec4
()
const
;
...
@@ -191,49 +191,49 @@ public:
...
@@ -191,49 +191,49 @@ public:
void
store
(
int
*
v
)
const
{
void
store
(
int
*
v
)
const
{
_mm_storeu_si128
((
__m128i
*
)
v
,
val
);
_mm_storeu_si128
((
__m128i
*
)
v
,
val
);
}
}
ivec4
operator
+
(
const
ivec4
&
other
)
const
{
ivec4
operator
+
(
ivec4
other
)
const
{
return
_mm_add_epi32
(
val
,
other
);
return
_mm_add_epi32
(
val
,
other
);
}
}
ivec4
operator
-
(
const
ivec4
&
other
)
const
{
ivec4
operator
-
(
ivec4
other
)
const
{
return
_mm_sub_epi32
(
val
,
other
);
return
_mm_sub_epi32
(
val
,
other
);
}
}
ivec4
operator
*
(
const
ivec4
&
other
)
const
{
ivec4
operator
*
(
ivec4
other
)
const
{
return
_mm_mullo_epi32
(
val
,
other
);
return
_mm_mullo_epi32
(
val
,
other
);
}
}
void
operator
+=
(
const
ivec4
&
other
)
{
void
operator
+=
(
ivec4
other
)
{
val
=
_mm_add_epi32
(
val
,
other
);
val
=
_mm_add_epi32
(
val
,
other
);
}
}
void
operator
-=
(
const
ivec4
&
other
)
{
void
operator
-=
(
ivec4
other
)
{
val
=
_mm_sub_epi32
(
val
,
other
);
val
=
_mm_sub_epi32
(
val
,
other
);
}
}
void
operator
*=
(
const
ivec4
&
other
)
{
void
operator
*=
(
ivec4
other
)
{
val
=
_mm_mullo_epi32
(
val
,
other
);
val
=
_mm_mullo_epi32
(
val
,
other
);
}
}
ivec4
operator
-
()
const
{
ivec4
operator
-
()
const
{
return
_mm_sub_epi32
(
_mm_set1_epi32
(
0
),
val
);
return
_mm_sub_epi32
(
_mm_set1_epi32
(
0
),
val
);
}
}
ivec4
operator
&
(
const
ivec4
&
other
)
const
{
ivec4
operator
&
(
ivec4
other
)
const
{
return
_mm_and_si128
(
val
,
other
);
return
_mm_and_si128
(
val
,
other
);
}
}
ivec4
operator
|
(
const
ivec4
&
other
)
const
{
ivec4
operator
|
(
ivec4
other
)
const
{
return
_mm_or_si128
(
val
,
other
);
return
_mm_or_si128
(
val
,
other
);
}
}
ivec4
operator
==
(
const
ivec4
&
other
)
const
{
ivec4
operator
==
(
ivec4
other
)
const
{
return
_mm_cmpeq_epi32
(
val
,
other
);
return
_mm_cmpeq_epi32
(
val
,
other
);
}
}
ivec4
operator
!=
(
const
ivec4
&
other
)
const
{
ivec4
operator
!=
(
ivec4
other
)
const
{
return
_mm_xor_si128
(
*
this
==
other
,
_mm_set1_epi32
(
0xFFFFFFFF
));
return
_mm_xor_si128
(
*
this
==
other
,
_mm_set1_epi32
(
0xFFFFFFFF
));
}
}
ivec4
operator
>
(
const
ivec4
&
other
)
const
{
ivec4
operator
>
(
ivec4
other
)
const
{
return
_mm_cmpgt_epi32
(
val
,
other
);
return
_mm_cmpgt_epi32
(
val
,
other
);
}
}
ivec4
operator
<
(
const
ivec4
&
other
)
const
{
ivec4
operator
<
(
ivec4
other
)
const
{
return
_mm_cmplt_epi32
(
val
,
other
);
return
_mm_cmplt_epi32
(
val
,
other
);
}
}
ivec4
operator
>=
(
const
ivec4
&
other
)
const
{
ivec4
operator
>=
(
ivec4
other
)
const
{
return
_mm_xor_si128
(
_mm_cmplt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
return
_mm_xor_si128
(
_mm_cmplt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
}
}
ivec4
operator
<=
(
const
ivec4
&
other
)
const
{
ivec4
operator
<=
(
ivec4
other
)
const
{
return
_mm_xor_si128
(
_mm_cmpgt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
return
_mm_xor_si128
(
_mm_cmpgt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
}
}
operator
fvec4
()
const
;
operator
fvec4
()
const
;
...
@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) {
...
@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s.
// Functions that operate on fvec4s.
static
inline
fvec4
floor
(
const
fvec4
&
v
)
{
static
inline
fvec4
floor
(
fvec4
v
)
{
return
fvec4
(
_mm_floor_ps
(
v
.
val
));
return
fvec4
(
_mm_floor_ps
(
v
.
val
));
}
}
static
inline
fvec4
ceil
(
const
fvec4
&
v
)
{
static
inline
fvec4
ceil
(
fvec4
v
)
{
return
fvec4
(
_mm_ceil_ps
(
v
.
val
));
return
fvec4
(
_mm_ceil_ps
(
v
.
val
));
}
}
static
inline
fvec4
round
(
const
fvec4
&
v
)
{
static
inline
fvec4
round
(
fvec4
v
)
{
return
fvec4
(
_mm_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
return
fvec4
(
_mm_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
}
}
static
inline
fvec4
min
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
min
(
fvec4
v1
,
fvec4
v2
)
{
return
fvec4
(
_mm_min_ps
(
v1
.
val
,
v2
.
val
));
return
fvec4
(
_mm_min_ps
(
v1
.
val
,
v2
.
val
));
}
}
static
inline
fvec4
max
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
max
(
fvec4
v1
,
fvec4
v2
)
{
return
fvec4
(
_mm_max_ps
(
v1
.
val
,
v2
.
val
));
return
fvec4
(
_mm_max_ps
(
v1
.
val
,
v2
.
val
));
}
}
static
inline
fvec4
abs
(
const
fvec4
&
v
)
{
static
inline
fvec4
abs
(
fvec4
v
)
{
static
const
__m128
mask
=
_mm_castsi128_ps
(
_mm_set1_epi32
(
0x7FFFFFFF
));
static
const
__m128
mask
=
_mm_castsi128_ps
(
_mm_set1_epi32
(
0x7FFFFFFF
));
return
fvec4
(
_mm_and_ps
(
v
.
val
,
mask
));
return
fvec4
(
_mm_and_ps
(
v
.
val
,
mask
));
}
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
sqrt
(
fvec4
v
)
{
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
}
}
static
inline
fvec4
rsqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
rsqrt
(
fvec4
v
)
{
// Initial estimate of rsqrt().
// Initial estimate of rsqrt().
fvec4
y
(
_mm_rsqrt_ps
(
v
.
val
));
fvec4
y
(
_mm_rsqrt_ps
(
v
.
val
));
...
@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) {
...
@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) {
return
y
;
return
y
;
}
}
static
inline
fvec4
exp
(
const
fvec4
&
v
)
{
static
inline
fvec4
exp
(
fvec4
v
)
{
return
fvec4
(
exp_ps
(
v
.
val
));
return
fvec4
(
exp_ps
(
v
.
val
));
}
}
static
inline
fvec4
log
(
const
fvec4
&
v
)
{
static
inline
fvec4
log
(
fvec4
v
)
{
return
fvec4
(
log_ps
(
v
.
val
));
return
fvec4
(
log_ps
(
v
.
val
));
}
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot3
(
fvec4
v1
,
fvec4
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
}
}
static
inline
float
dot4
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot4
(
fvec4
v1
,
fvec4
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0xF1
));
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0xF1
));
}
}
static
inline
float
reduceAdd
(
const
fvec4
v
)
{
static
inline
float
reduceAdd
(
fvec4
v
)
{
return
dot4
(
v
,
fvec4
(
1.0
f
));
return
dot4
(
v
,
fvec4
(
1.0
f
));
}
}
static
inline
fvec4
cross
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
cross
(
fvec4
v1
,
fvec4
v2
)
{
fvec4
temp
=
fvec4
(
_mm_mul_ps
(
v1
,
_mm_shuffle_ps
(
v2
,
v2
,
_MM_SHUFFLE
(
3
,
0
,
2
,
1
))))
-
fvec4
temp
=
fvec4
(
_mm_mul_ps
(
v1
,
_mm_shuffle_ps
(
v2
,
v2
,
_MM_SHUFFLE
(
3
,
0
,
2
,
1
))))
-
fvec4
(
_mm_mul_ps
(
v2
,
_mm_shuffle_ps
(
v1
,
v1
,
_MM_SHUFFLE
(
3
,
0
,
2
,
1
))));
fvec4
(
_mm_mul_ps
(
v2
,
_mm_shuffle_ps
(
v1
,
v1
,
_MM_SHUFFLE
(
3
,
0
,
2
,
1
))));
return
_mm_shuffle_ps
(
temp
,
temp
,
_MM_SHUFFLE
(
3
,
0
,
2
,
1
));
return
_mm_shuffle_ps
(
temp
,
temp
,
_MM_SHUFFLE
(
3
,
0
,
2
,
1
));
...
@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
...
@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/**
/**
* Out-of-place transpose from named variables into an array.
* Out-of-place transpose from named variables into an array.
*/
*/
static
inline
void
transpose
(
const
fvec4
v0
,
const
fvec4
v1
,
const
fvec4
v2
,
const
fvec4
v3
,
fvec4
out
[
4
])
{
static
inline
void
transpose
(
fvec4
v0
,
fvec4
v1
,
fvec4
v2
,
fvec4
v3
,
fvec4
out
[
4
])
{
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
out
[
0
]
=
v0
;
out
[
1
]
=
v1
;
out
[
2
]
=
v2
;
out
[
3
]
=
v3
;
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
transpose
(
out
[
0
],
out
[
1
],
out
[
2
],
out
[
3
]);
}
}
// Functions that operate on ivec4s.
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
min
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
_mm_min_epi32
(
v1
.
val
,
v2
.
val
));
return
ivec4
(
_mm_min_epi32
(
v1
.
val
,
v2
.
val
));
}
}
static
inline
ivec4
max
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
static
inline
ivec4
max
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
_mm_max_epi32
(
v1
.
val
,
v2
.
val
));
return
ivec4
(
_mm_max_epi32
(
v1
.
val
,
v2
.
val
));
}
}
static
inline
ivec4
abs
(
const
ivec4
&
v
)
{
static
inline
ivec4
abs
(
ivec4
v
)
{
return
ivec4
(
_mm_abs_epi32
(
v
.
val
));
return
ivec4
(
_mm_abs_epi32
(
v
.
val
));
}
}
static
inline
bool
any
(
const
ivec4
&
v
)
{
static
inline
bool
any
(
ivec4
v
)
{
return
!
_mm_test_all_zeros
(
v
,
_mm_set1_epi32
(
0xFFFFFFFF
));
return
!
_mm_test_all_zeros
(
v
,
_mm_set1_epi32
(
0xFFFFFFFF
));
}
}
// Mathematical operators involving a scalar and a vector.
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
+
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
+
v2
;
return
fvec4
(
v1
)
+
v2
;
}
}
static
inline
fvec4
operator
-
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
-
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
-
v2
;
return
fvec4
(
v1
)
-
v2
;
}
}
static
inline
fvec4
operator
*
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
*
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
*
v2
;
return
fvec4
(
v1
)
*
v2
;
}
}
static
inline
fvec4
operator
/
(
float
v1
,
const
fvec4
&
v2
)
{
static
inline
fvec4
operator
/
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
/
v2
;
return
fvec4
(
v1
)
/
v2
;
}
}
// Operations for blending fvec4
// Operations for blending fvec4
static
inline
fvec4
blend
(
const
fvec4
&
v1
,
const
fvec4
&
v2
,
const
fvec4
&
mask
)
{
static
inline
fvec4
blend
(
fvec4
v1
,
fvec4
v2
,
fvec4
mask
)
{
return
fvec4
(
_mm_blendv_ps
(
v1
.
val
,
v2
.
val
,
mask
.
val
));
return
fvec4
(
_mm_blendv_ps
(
v1
.
val
,
v2
.
val
,
mask
.
val
));
}
}
static
inline
fvec4
blendZero
(
const
fvec4
v
,
const
fvec4
mask
)
{
static
inline
fvec4
blendZero
(
fvec4
v
,
fvec4
mask
)
{
return
blend
(
0.0
f
,
v
,
mask
);
return
blend
(
0.0
f
,
v
,
mask
);
}
}
...
@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
...
@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
* of vectors. The first result vector contains the values at the given indexes, and the second
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
* result vector contains the values from each respective index+1.
*/
*/
static
inline
void
gatherVecPair
(
const
float
*
table
,
const
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
static
inline
void
gatherVecPair
(
const
float
*
table
,
ivec4
index
,
fvec4
&
out0
,
fvec4
&
out1
)
{
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t0
(
table
+
index
[
0
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t1
(
table
+
index
[
1
]);
fvec4
t2
(
table
+
index
[
2
]);
fvec4
t2
(
table
+
index
[
2
]);
...
@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
...
@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined
* output[3] = undefined
*/
*/
static
inline
fvec4
reduceToVec3
(
const
fvec4
x
,
const
fvec4
y
,
const
fvec4
z
)
{
static
inline
fvec4
reduceToVec3
(
fvec4
x
,
fvec4
y
,
fvec4
z
)
{
// :TODO: Could be made more efficient.
// :TODO: Could be made more efficient.
const
auto
nx
=
reduceAdd
(
x
);
const
auto
nx
=
reduceAdd
(
x
);
const
auto
ny
=
reduceAdd
(
y
);
const
auto
ny
=
reduceAdd
(
y
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment