Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
f8dc7aa1
Commit
f8dc7aa1
authored
Jun 15, 2020
by
Daniel Towner
Browse files
CPU: Various optimisations which improve AVX/AVX2
parent
e4c1f73c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
6 deletions
+42
-6
openmmapi/include/openmm/internal/vectorizeAvx2.h
openmmapi/include/openmm/internal/vectorizeAvx2.h
+32
-0
platforms/cpu/include/CpuNonbondedForceFvec.h
platforms/cpu/include/CpuNonbondedForceFvec.h
+10
-6
No files found.
openmmapi/include/openmm/internal/vectorizeAvx2.h
View file @
f8dc7aa1
...
...
@@ -101,4 +101,36 @@ inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
return
_mm256_castsi256_ps
(
msb
);
}
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static
inline
void
gatherVecPair
(
const
float
*
table
,
ivec8
index
,
fvecAvx2
&
out0
,
fvecAvx2
&
out1
)
{
const
double
*
tableAsDbl
=
(
const
double
*
)
table
;
// The input is a set of 8 indexes, each of which refers to a pair of floating point
// values. The most efficient way to load from indexes in a vector is the gather instruction,
// and the 64-bit variant should be used to get the pairs.
// Given indexes ABCDEFGH, load the pairs corresponding to A C E G. This gives a set of
// 4 pairs. The high indexes (in the upper part of each 64-bit index) are cleared.
const
auto
lowerIdx
=
_mm256_and_si256
(
index
,
_mm256_set1_epi64x
(
0xFFFFFFFF
));
const
auto
lowerGather
=
_mm256_castpd_ps
(
_mm256_i64gather_pd
(
tableAsDbl
,
lowerIdx
,
4
));
// Load indexes B D F H, this time by shifting the high 32-bit indexes into the lower 32-bits.
const
auto
upperIdx
=
_mm256_srli_epi64
(
index
,
32
);
const
auto
upperGather
=
_mm256_castpd_ps
(
_mm256_i64gather_pd
(
tableAsDbl
,
upperIdx
,
4
));
// All the first values must now be brought together. The lower values are already in the
// correct place, but the upper gather values must be moved over and blended in.
const
auto
swapUpper
=
_mm256_permute_ps
(
upperGather
,
0b10110001
);
out0
=
fvecAvx2
(
_mm256_blend_ps
(
lowerGather
,
swapUpper
,
0b10101010
));
// And the same for the upper values.
const
auto
swapLower
=
_mm256_permute_ps
(
lowerGather
,
0b10110001
);
out1
=
fvecAvx2
(
_mm256_blend_ps
(
swapLower
,
upperGather
,
0b10101010
));
}
#endif
/*OPENMM_VECTORIZE_AVX2_H_*/
platforms/cpu/include/CpuNonbondedForceFvec.h
View file @
f8dc7aa1
...
...
@@ -109,10 +109,8 @@ CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<floa
FVEC
s1
,
s2
;
gatherVecPair
(
table
.
data
(),
index
,
s1
,
s2
);
const
auto
coeff2
=
x1
-
FVEC
(
index
);
const
auto
coeff1
=
1.0
f
-
coeff2
;
return
coeff1
*
s1
+
coeff2
*
s2
;
const
auto
coeff2
=
x1
-
FVEC
(
index
);
return
(
s1
-
coeff2
*
s1
)
+
coeff2
*
s2
;
}
template
<
typename
FVEC
>
...
...
@@ -216,6 +214,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
// Loop over neighbors for this block.
const
auto
&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
auto
&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
FVEC
partialEnergy
=
{};
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
...
...
@@ -230,7 +230,7 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
getDeltaR
<
PERIODIC_TYPE
>
(
atomPos
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
boxSize
,
invBoxSize
);
const
auto
exclNotMask
=
FVEC
::
expandBitsToMask
(
~
exclusions
[
i
]);
const
auto
include
=
blendZero
(
r2
<
cutoffDistance
*
cutoffDistance
,
exclNotMask
);
const
auto
include
=
blendZero
(
r2
<
cutoffDistance
Squared
,
exclNotMask
);
if
(
!
any
(
include
))
continue
;
// No interactions to compute.
...
...
@@ -296,7 +296,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
energy
+=
chargeProd
*
inverseR
;
}
energy
=
blendZero
(
energy
,
include
);
*
totalEnergy
+=
reduceAdd
(
energy
);
partialEnergy
+=
energy
;
}
// Accumulate forces.
...
...
@@ -313,6 +314,9 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
newAtomForce
.
store
(
atomForce
);
}
if
(
totalEnergy
)
*
totalEnergy
+=
reduceAdd
(
partialEnergy
);
// Record the forces on the block atoms.
fvec4
f
[
blockSize
];
transpose
(
blockAtomForceX
,
blockAtomForceY
,
blockAtomForceZ
,
0.0
f
,
f
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment