Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
4dc4b61b
Unverified
Commit
4dc4b61b
authored
Jun 15, 2020
by
peastman
Committed by
GitHub
Jun 15, 2020
Browse files
Merge pull request #2741 from dwtowner/cpu_avx_optimisations
CPU: Various optimisations which improve AVX/AVX2
parents
e4c1f73c
f8dc7aa1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
6 deletions
+42
-6
openmmapi/include/openmm/internal/vectorizeAvx2.h
openmmapi/include/openmm/internal/vectorizeAvx2.h
+32
-0
platforms/cpu/include/CpuNonbondedForceFvec.h
platforms/cpu/include/CpuNonbondedForceFvec.h
+10
-6
No files found.
openmmapi/include/openmm/internal/vectorizeAvx2.h
View file @
4dc4b61b
...
...
@@ -101,4 +101,36 @@ inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
return
_mm256_castsi256_ps
(
msb
);
}
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static
inline
void
gatherVecPair
(
const
float
*
table
,
ivec8
index
,
fvecAvx2
&
out0
,
fvecAvx2
&
out1
)
{
const
double
*
tableAsDbl
=
(
const
double
*
)
table
;
// The input is a set of 8 indexes, each of which refers to a pair of floating point
// values. The most efficient way to load from indexes in a vector is the gather instruction,
// and the 64-bit variant should be used to get the pairs.
// Given indexes ABCDEFGH, load the pairs corresponding to A C E G. This gives a set of
// 4 pairs. The high indexes (in the upper part of each 64-bit index) are cleared.
const
auto
lowerIdx
=
_mm256_and_si256
(
index
,
_mm256_set1_epi64x
(
0xFFFFFFFF
));
const
auto
lowerGather
=
_mm256_castpd_ps
(
_mm256_i64gather_pd
(
tableAsDbl
,
lowerIdx
,
4
));
// Load indexes B D F H, this time by shifting the high 32-bit indexes into the lower 32-bits.
const
auto
upperIdx
=
_mm256_srli_epi64
(
index
,
32
);
const
auto
upperGather
=
_mm256_castpd_ps
(
_mm256_i64gather_pd
(
tableAsDbl
,
upperIdx
,
4
));
// All the first values must now be brought together. The lower values are already in the
// correct place, but the upper gather values must be moved over and blended in.
const
auto
swapUpper
=
_mm256_permute_ps
(
upperGather
,
0b10110001
);
out0
=
fvecAvx2
(
_mm256_blend_ps
(
lowerGather
,
swapUpper
,
0b10101010
));
// And the same for the upper values.
const
auto
swapLower
=
_mm256_permute_ps
(
lowerGather
,
0b10110001
);
out1
=
fvecAvx2
(
_mm256_blend_ps
(
swapLower
,
upperGather
,
0b10101010
));
}
#endif
/*OPENMM_VECTORIZE_AVX2_H_*/
platforms/cpu/include/CpuNonbondedForceFvec.h
View file @
4dc4b61b
...
...
@@ -109,10 +109,8 @@ CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<floa
FVEC
s1
,
s2
;
gatherVecPair
(
table
.
data
(),
index
,
s1
,
s2
);
const
auto
coeff2
=
x1
-
FVEC
(
index
);
const
auto
coeff1
=
1.0
f
-
coeff2
;
return
coeff1
*
s1
+
coeff2
*
s2
;
const
auto
coeff2
=
x1
-
FVEC
(
index
);
return
(
s1
-
coeff2
*
s1
)
+
coeff2
*
s2
;
}
template
<
typename
FVEC
>
...
...
@@ -216,6 +214,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
// Loop over neighbors for this block.
const
auto
&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
auto
&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
FVEC
partialEnergy
=
{};
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
...
...
@@ -230,7 +230,7 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
getDeltaR
<
PERIODIC_TYPE
>
(
atomPos
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
boxSize
,
invBoxSize
);
const
auto
exclNotMask
=
FVEC
::
expandBitsToMask
(
~
exclusions
[
i
]);
const
auto
include
=
blendZero
(
r2
<
cutoffDistance
*
cutoffDistance
,
exclNotMask
);
const
auto
include
=
blendZero
(
r2
<
cutoffDistance
Squared
,
exclNotMask
);
if
(
!
any
(
include
))
continue
;
// No interactions to compute.
...
...
@@ -296,7 +296,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
energy
+=
chargeProd
*
inverseR
;
}
energy
=
blendZero
(
energy
,
include
);
*
totalEnergy
+=
reduceAdd
(
energy
);
partialEnergy
+=
energy
;
}
// Accumulate forces.
...
...
@@ -313,6 +314,9 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
newAtomForce
.
store
(
atomForce
);
}
if
(
totalEnergy
)
*
totalEnergy
+=
reduceAdd
(
partialEnergy
);
// Record the forces on the block atoms.
fvec4
f
[
blockSize
];
transpose
(
blockAtomForceX
,
blockAtomForceY
,
blockAtomForceZ
,
0.0
f
,
f
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment