Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
79fd69c3
Commit
79fd69c3
authored
Oct 23, 2013
by
peastman
Browse files
Merge pull request #182 from peastman/master
Lots of optimizations to CPU platform
parents
b5da7380
062a2db0
Changes
12
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1029 additions
and
350 deletions
+1029
-350
CMakeLists.txt
CMakeLists.txt
+1
-0
libraries/sfmt/include/sfmt/SFMT-sse2.h
libraries/sfmt/include/sfmt/SFMT-sse2.h
+16
-14
libraries/sfmt/src/SFMT.cpp
libraries/sfmt/src/SFMT.cpp
+2
-2
openmmapi/include/openmm/internal/vectorize.h
openmmapi/include/openmm/internal/vectorize.h
+248
-0
platforms/cpu/include/CpuNeighborList.h
platforms/cpu/include/CpuNeighborList.h
+21
-5
platforms/cpu/include/CpuNonbondedForce.h
platforms/cpu/include/CpuNonbondedForce.h
+30
-14
platforms/cpu/src/CpuKernels.cpp
platforms/cpu/src/CpuKernels.cpp
+33
-5
platforms/cpu/src/CpuNeighborList.cpp
platforms/cpu/src/CpuNeighborList.cpp
+325
-118
platforms/cpu/src/CpuNonbondedForce.cpp
platforms/cpu/src/CpuNonbondedForce.cpp
+253
-103
platforms/cpu/tests/TestCpuNeighborList.cpp
platforms/cpu/tests/TestCpuNeighborList.cpp
+16
-5
plugins/cpupme/src/CpuPmeKernels.cpp
plugins/cpupme/src/CpuPmeKernels.cpp
+82
-84
wrappers/python/src/swig_doxygen/swigInputConfig.py
wrappers/python/src/swig_doxygen/swigInputConfig.py
+2
-0
No files found.
CMakeLists.txt
View file @
79fd69c3
...
@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
...
@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
## OpenMM was previously installed there.
## OpenMM was previously installed there.
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
ENDFOREACH
(
subdir
)
ENDFOREACH
(
subdir
)
SET_SOURCE_FILES_PROPERTIES
(
${
CMAKE_SOURCE_DIR
}
/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS
"-msse2 -DHAVE_SSE2=1"
)
# If API wrappers are being generated, and add them to the build.
# If API wrappers are being generated, and add them to the build.
FIND_PROGRAM
(
GCCXML_PATH gccxml PATH
FIND_PROGRAM
(
GCCXML_PATH gccxml PATH
...
...
libraries/sfmt/include/sfmt/SFMT-sse2.h
View file @
79fd69c3
...
@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
...
@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
* This function fills the internal state array with pseudorandom
* This function fills the internal state array with pseudorandom
* integers.
* integers.
*/
*/
inline
static
void
gen_rand_all
(
void
)
{
inline
static
void
gen_rand_all
(
SFMT
&
sfmt
)
{
int
i
;
int
i
;
__m128i
r
,
r1
,
r2
,
mask
;
__m128i
r
,
r1
,
r2
,
mask
;
mask
=
_mm_set_epi32
(
MSK4
,
MSK3
,
MSK2
,
MSK1
);
mask
=
_mm_set_epi32
(
MSK4
,
MSK3
,
MSK2
,
MSK1
);
r1
=
_mm_load_si128
(
&
sfmt
[
N
-
2
].
si
);
SFMTData
&
data
=
*
sfmt
.
data
;
r2
=
_mm_load_si128
(
&
sfmt
[
N
-
1
].
si
);
r1
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
2
].
si
);
r2
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
1
].
si
);
for
(
i
=
0
;
i
<
N
-
POS1
;
i
++
)
{
for
(
i
=
0
;
i
<
N
-
POS1
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
data
.
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
sfmt
[
i
].
si
,
r
);
_mm_store_si128
(
&
data
.
sfmt
[
i
].
si
,
r
);
r1
=
r2
;
r1
=
r2
;
r2
=
r
;
r2
=
r
;
}
}
for
(;
i
<
N
;
i
++
)
{
for
(;
i
<
N
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
sfmt
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
data
.
sfmt
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
sfmt
[
i
].
si
,
r
);
_mm_store_si128
(
&
data
.
sfmt
[
i
].
si
,
r
);
r1
=
r2
;
r1
=
r2
;
r2
=
r
;
r2
=
r
;
}
}
...
@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) {
...
@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) {
* @param array an 128-bit array to be filled by pseudorandom numbers.
* @param array an 128-bit array to be filled by pseudorandom numbers.
* @param size number of 128-bit pesudorandom numbers to be generated.
* @param size number of 128-bit pesudorandom numbers to be generated.
*/
*/
inline
static
void
gen_rand_array
(
w128_t
*
array
,
int
size
)
{
inline
static
void
gen_rand_array
(
w128_t
*
array
,
int
size
,
SFMT
&
sfmt
)
{
int
i
,
j
;
int
i
,
j
;
__m128i
r
,
r1
,
r2
,
mask
;
__m128i
r
,
r1
,
r2
,
mask
;
mask
=
_mm_set_epi32
(
MSK4
,
MSK3
,
MSK2
,
MSK1
);
mask
=
_mm_set_epi32
(
MSK4
,
MSK3
,
MSK2
,
MSK1
);
r1
=
_mm_load_si128
(
&
sfmt
[
N
-
2
].
si
);
SFMTData
&
data
=
*
sfmt
.
data
;
r2
=
_mm_load_si128
(
&
sfmt
[
N
-
1
].
si
);
r1
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
2
].
si
);
r2
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
1
].
si
);
for
(
i
=
0
;
i
<
N
-
POS1
;
i
++
)
{
for
(
i
=
0
;
i
<
N
-
POS1
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
data
.
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
r1
=
r2
;
r1
=
r2
;
r2
=
r
;
r2
=
r
;
}
}
for
(;
i
<
N
;
i
++
)
{
for
(;
i
<
N
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
array
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
array
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
r1
=
r2
;
r1
=
r2
;
r2
=
r
;
r2
=
r
;
...
@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) {
...
@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) {
}
}
for
(
j
=
0
;
j
<
2
*
N
-
size
;
j
++
)
{
for
(
j
=
0
;
j
<
2
*
N
-
size
;
j
++
)
{
r
=
_mm_load_si128
(
&
array
[
j
+
size
-
N
].
si
);
r
=
_mm_load_si128
(
&
array
[
j
+
size
-
N
].
si
);
_mm_store_si128
(
&
sfmt
[
j
].
si
,
r
);
_mm_store_si128
(
&
data
.
sfmt
[
j
].
si
,
r
);
}
}
for
(;
i
<
size
;
i
++
)
{
for
(;
i
<
size
;
i
++
)
{
r
=
mm_recursion
(
&
array
[
i
-
N
].
si
,
&
array
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
r
=
mm_recursion
(
&
array
[
i
-
N
].
si
,
&
array
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
mask
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
_mm_store_si128
(
&
sfmt
[
j
++
].
si
,
r
);
_mm_store_si128
(
&
data
.
sfmt
[
j
++
].
si
,
r
);
r1
=
r2
;
r1
=
r2
;
r2
=
r
;
r2
=
r
;
}
}
...
...
libraries/sfmt/src/SFMT.cpp
View file @
79fd69c3
...
@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size);
...
@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size);
#endif
#endif
#if defined(HAVE_ALTIVEC)
#if defined(HAVE_ALTIVEC)
#include "SFMT-alti.h"
#include "
sfmt/
SFMT-alti.h"
#elif defined(HAVE_SSE2)
#elif defined(HAVE_SSE2)
#include "SFMT-sse2.h"
#include "
sfmt/
SFMT-sse2.h"
#endif
#endif
/**
/**
...
...
openmmapi/include/openmm/internal/vectorize.h
0 → 100644
View file @
79fd69c3
#ifndef OPENMM_VECTORIZE_H_
#define OPENMM_VECTORIZE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class
ivec4
;
/**
* A four element vector of floats.
*/
class
fvec4
{
public:
__m128
val
;
fvec4
()
{}
fvec4
(
float
v
)
:
val
(
_mm_set1_ps
(
v
))
{}
fvec4
(
float
v1
,
float
v2
,
float
v3
,
float
v4
)
:
val
(
_mm_set_ps
(
v4
,
v3
,
v2
,
v1
))
{}
fvec4
(
__m128
v
)
:
val
(
v
)
{}
fvec4
(
const
float
*
v
)
:
val
(
_mm_loadu_ps
(
v
))
{}
operator
__m128
()
const
{
return
val
;
}
float
operator
[](
int
i
)
const
{
int
resultBits
=
_mm_extract_ps
(
val
,
i
);
return
*
((
float
*
)
&
resultBits
);
}
void
store
(
float
*
v
)
const
{
_mm_storeu_ps
(
v
,
val
);
}
fvec4
operator
+
(
fvec4
other
)
const
{
return
_mm_add_ps
(
val
,
other
);
}
fvec4
operator
-
(
fvec4
other
)
const
{
return
_mm_sub_ps
(
val
,
other
);
}
fvec4
operator
*
(
fvec4
other
)
const
{
return
_mm_mul_ps
(
val
,
other
);
}
fvec4
operator
/
(
fvec4
other
)
const
{
return
_mm_div_ps
(
val
,
other
);
}
void
operator
+=
(
fvec4
other
)
{
val
=
_mm_add_ps
(
val
,
other
);
}
void
operator
-=
(
fvec4
other
)
{
val
=
_mm_sub_ps
(
val
,
other
);
}
void
operator
*=
(
fvec4
other
)
{
val
=
_mm_mul_ps
(
val
,
other
);
}
void
operator
/=
(
fvec4
other
)
{
val
=
_mm_div_ps
(
val
,
other
);
}
fvec4
operator
-
()
const
{
return
_mm_sub_ps
(
_mm_set1_ps
(
0.0
f
),
val
);
}
fvec4
operator
&
(
fvec4
other
)
const
{
return
_mm_and_ps
(
val
,
other
);
}
fvec4
operator
==
(
fvec4
other
)
const
{
return
_mm_cmpeq_ps
(
val
,
other
);
}
fvec4
operator
!=
(
fvec4
other
)
const
{
return
_mm_cmpneq_ps
(
val
,
other
);
}
fvec4
operator
>
(
fvec4
other
)
const
{
return
_mm_cmpgt_ps
(
val
,
other
);
}
fvec4
operator
<
(
fvec4
other
)
const
{
return
_mm_cmplt_ps
(
val
,
other
);
}
fvec4
operator
>=
(
fvec4
other
)
const
{
return
_mm_cmpge_ps
(
val
,
other
);
}
fvec4
operator
<=
(
fvec4
other
)
const
{
return
_mm_cmple_ps
(
val
,
other
);
}
operator
ivec4
()
const
;
};
/**
* A four element vector of ints.
*/
class
ivec4
{
public:
__m128i
val
;
ivec4
()
{}
ivec4
(
int
v
)
:
val
(
_mm_set1_epi32
(
v
))
{}
ivec4
(
int
v1
,
int
v2
,
int
v3
,
int
v4
)
:
val
(
_mm_set_epi32
(
v4
,
v3
,
v2
,
v1
))
{}
ivec4
(
__m128i
v
)
:
val
(
v
)
{}
ivec4
(
const
int
*
v
)
:
val
(
_mm_loadu_si128
((
const
__m128i
*
)
v
))
{}
operator
__m128i
()
const
{
return
val
;
}
int
operator
[](
int
i
)
const
{
return
_mm_extract_epi32
(
val
,
i
);
}
void
store
(
int
*
v
)
const
{
_mm_storeu_si128
((
__m128i
*
)
v
,
val
);
}
ivec4
operator
+
(
ivec4
other
)
const
{
return
_mm_add_epi32
(
val
,
other
);
}
ivec4
operator
-
(
ivec4
other
)
const
{
return
_mm_sub_epi32
(
val
,
other
);
}
ivec4
operator
*
(
ivec4
other
)
const
{
return
_mm_mul_epi32
(
val
,
other
);
}
void
operator
+=
(
ivec4
other
)
{
val
=
_mm_add_epi32
(
val
,
other
);
}
void
operator
-=
(
ivec4
other
)
{
val
=
_mm_sub_epi32
(
val
,
other
);
}
void
operator
*=
(
ivec4
other
)
{
val
=
_mm_mul_epi32
(
val
,
other
);
}
ivec4
operator
-
()
const
{
return
_mm_sub_epi32
(
_mm_set1_epi32
(
0
),
val
);
}
ivec4
operator
&
(
ivec4
other
)
const
{
return
_mm_and_si128
(
val
,
other
);
}
ivec4
operator
==
(
ivec4
other
)
const
{
return
_mm_cmpeq_epi32
(
val
,
other
);
}
operator
fvec4
()
const
;
};
// Conversion operators.
inline
fvec4
::
operator
ivec4
()
const
{
return
_mm_cvttps_epi32
(
val
);
}
inline
ivec4
::
operator
fvec4
()
const
{
return
_mm_cvtepi32_ps
(
val
);
}
// Functions that operate on fvec4s.
static
inline
fvec4
floor
(
fvec4
v
)
{
return
fvec4
(
_mm_floor_ps
(
v
.
val
));
}
static
inline
fvec4
ceil
(
fvec4
v
)
{
return
fvec4
(
_mm_ceil_ps
(
v
.
val
));
}
static
inline
fvec4
round
(
fvec4
v
)
{
return
fvec4
(
_mm_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
}
static
inline
fvec4
min
(
fvec4
v1
,
fvec4
v2
)
{
return
fvec4
(
_mm_min_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
max
(
fvec4
v1
,
fvec4
v2
)
{
return
fvec4
(
_mm_max_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
abs
(
fvec4
v
)
{
static
const
__m128
mask
=
_mm_castsi128_ps
(
_mm_set1_epi32
(
0x7FFFFFFF
));
return
fvec4
(
_mm_and_ps
(
v
.
val
,
mask
));
}
static
inline
fvec4
sqrt
(
fvec4
v
)
{
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
}
static
inline
float
dot3
(
fvec4
v1
,
fvec4
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
}
static
inline
float
dot4
(
fvec4
v1
,
fvec4
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0xF1
));
}
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
_mm_min_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
max
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
_mm_max_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
abs
(
ivec4
v
)
{
return
ivec4
(
_mm_abs_epi32
(
v
.
val
));
}
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
+
v2
;
}
static
inline
fvec4
operator
-
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
-
v2
;
}
static
inline
fvec4
operator
*
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
*
v2
;
}
static
inline
fvec4
operator
/
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
/
v2
;
}
#endif
/*OPENMM_VECTORIZE_H_*/
platforms/cpu/include/CpuNeighborList.h
View file @
79fd69c3
...
@@ -12,26 +12,42 @@ namespace OpenMM {
...
@@ -12,26 +12,42 @@ namespace OpenMM {
class
OPENMM_EXPORT_CPU
CpuNeighborList
{
class
OPENMM_EXPORT_CPU
CpuNeighborList
{
public:
public:
class
ThreadData
;
class
ThreadData
;
class
VoxelHash
;
class
Voxels
;
static
const
int
BlockSize
;
CpuNeighborList
();
CpuNeighborList
();
~
CpuNeighborList
();
~
CpuNeighborList
();
void
computeNeighborList
(
int
numAtoms
,
const
std
::
vector
<
float
>&
atomLocations
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
void
computeNeighborList
(
int
numAtoms
,
const
std
::
vector
<
float
>&
atomLocations
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
);
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
);
const
std
::
vector
<
std
::
pair
<
int
,
int
>
>&
getNeighbors
();
int
getNumBlocks
()
const
;
const
std
::
vector
<
int
>&
getSortedAtoms
()
const
;
const
std
::
vector
<
int
>&
getBlockNeighbors
(
int
blockIndex
)
const
;
const
std
::
vector
<
char
>&
getBlockExclusions
(
int
blockIndex
)
const
;
/**
/**
* This routine contains the code executed by each thread.
* This routine contains the code executed by each thread.
*/
*/
void
runThread
(
int
index
,
std
::
vector
<
std
::
pair
<
int
,
int
>
>&
threadNeighbors
);
void
runThread
(
int
index
);
private:
private:
/**
* This is called by the worker threads to wait until the master thread instructs them to advance.
*/
void
threadWait
();
/**
* This is called by the master thread to instruct all the worker threads to advance.
*/
void
advanceThreads
();
bool
isDeleted
;
bool
isDeleted
;
int
numThreads
,
waitCount
;
int
numThreads
,
waitCount
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
neighbors
;
std
::
vector
<
int
>
sortedAtoms
;
std
::
vector
<
std
::
vector
<
int
>
>
blockNeighbors
;
std
::
vector
<
std
::
vector
<
char
>
>
blockExclusions
;
std
::
vector
<
pthread_t
>
thread
;
std
::
vector
<
pthread_t
>
thread
;
std
::
vector
<
ThreadData
*>
threadData
;
std
::
vector
<
ThreadData
*>
threadData
;
pthread_cond_t
startCondition
,
endCondition
;
pthread_cond_t
startCondition
,
endCondition
;
pthread_mutex_t
lock
;
pthread_mutex_t
lock
;
// The following variables are used to make information accessible to the individual threads.
// The following variables are used to make information accessible to the individual threads.
VoxelHash
*
voxelHash
;
float
minx
,
maxx
,
miny
,
maxy
,
minz
,
maxz
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
atomBins
;
Voxels
*
voxels
;
const
std
::
vector
<
std
::
set
<
int
>
>*
exclusions
;
const
std
::
vector
<
std
::
set
<
int
>
>*
exclusions
;
const
float
*
atomLocations
;
const
float
*
atomLocations
;
const
float
*
periodicBoxSize
;
const
float
*
periodicBoxSize
;
...
...
platforms/cpu/include/CpuNonbondedForce.h
View file @
79fd69c3
...
@@ -25,14 +25,17 @@
...
@@ -25,14 +25,17 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/vectorize.h"
#include <pthread.h>
#include <pthread.h>
#include <set>
#include <set>
#include <utility>
#include <utility>
#include <vector>
#include <vector>
#include <smmintrin.h>
// ---------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------
namespace
OpenMM
{
class
CpuNonbondedForce
{
class
CpuNonbondedForce
{
public:
public:
class
ThreadData
;
class
ThreadData
;
...
@@ -63,7 +66,7 @@ class CpuNonbondedForce {
...
@@ -63,7 +66,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
setUseCutoff
(
float
distance
,
const
std
::
vector
<
std
::
pair
<
int
,
int
>
>
&
neighbors
,
float
solventDielectric
);
void
setUseCutoff
(
float
distance
,
const
CpuNeighborList
&
neighbors
,
float
solventDielectric
);
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
...
@@ -127,9 +130,9 @@ class CpuNonbondedForce {
...
@@ -127,9 +130,9 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
calculateReciprocalIxn
(
int
numberOfAtoms
,
float
*
posq
,
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
void
calculateReciprocalIxn
(
int
numberOfAtoms
,
float
*
posq
,
std
::
vector
<
RealVec
>&
atomCoordinates
,
const
std
::
vector
<
std
::
pair
<
float
,
float
>
>&
atomParameters
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
const
std
::
vector
<
std
::
pair
<
float
,
float
>
>&
atomParameters
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
std
::
vector
<
OpenMM
::
RealVec
>&
forces
,
float
*
totalEnergy
)
const
;
std
::
vector
<
RealVec
>&
forces
,
float
*
totalEnergy
)
const
;
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
...
@@ -159,14 +162,14 @@ private:
...
@@ -159,14 +162,14 @@ private:
bool
periodic
;
bool
periodic
;
bool
ewald
;
bool
ewald
;
bool
pme
;
bool
pme
;
const
std
::
vector
<
std
::
pair
<
int
,
int
>
>
*
neighborList
;
const
CpuNeighborList
*
neighborList
;
float
periodicBoxSize
[
3
];
float
periodicBoxSize
[
3
];
float
cutoffDistance
,
switchingDistance
;
float
cutoffDistance
,
switchingDistance
;
float
krf
,
crf
;
float
krf
,
crf
;
float
alphaEwald
;
float
alphaEwald
;
int
numRx
,
numRy
,
numRz
;
int
numRx
,
numRy
,
numRz
;
int
meshDim
[
3
];
int
meshDim
[
3
];
std
::
vector
<
float
>
ewaldScale
X
,
ewaldScaleY
,
ewaldScaleDeriv
;
std
::
vector
<
float
>
ewaldScale
Table
;
float
ewaldDX
,
ewaldDXInv
;
float
ewaldDX
,
ewaldDXInv
;
bool
isDeleted
;
bool
isDeleted
;
int
numThreads
,
waitCount
;
int
numThreads
,
waitCount
;
...
@@ -195,31 +198,42 @@ private:
...
@@ -195,31 +198,42 @@ private:
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
calculateOneIxn
(
int
atom1
,
int
atom2
,
float
*
forces
,
double
*
totalEnergy
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
);
void
calculateOneIxn
(
int
atom1
,
int
atom2
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Calculate
LJ Coulomb pair ixn between two atoms
Calculate
all the interactions for one atom block.
@param atom1 the index of the first atom
@param blockIndex the index of the atom block
@param atom2 the index of the second atom
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void
calculateBlockIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param forces force array (forces added)
@param totalEnergy total energy
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
calculate
One
EwaldIxn
(
int
atom1
,
int
atom2
,
float
*
forces
,
double
*
totalEnergy
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
);
void
calculate
Block
EwaldIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
/**
/**
* Compute the displacement and squared distance between two points, optionally using
* Compute the displacement and squared distance between two points, optionally using
* periodic boundary conditions.
* periodic boundary conditions.
*/
*/
void
getDeltaR
(
const
__m128
&
posI
,
const
__m128
&
posJ
,
__m128
&
deltaR
,
float
&
r2
,
bool
periodic
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
)
const
;
void
getDeltaR
(
const
fvec4
&
posI
,
const
fvec4
&
posJ
,
fvec4
&
deltaR
,
float
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
;
/**
/**
* Compute a fast approximation to erfc(x).
* Compute a fast approximation to erfc(x).
*/
*/
static
f
loat
erfcApprox
(
f
loat
x
);
static
f
vec4
erfcApprox
(
f
vec4
x
);
/**
/**
* Create a lookup table for the scale factor used with Ewald and PME.
* Create a lookup table for the scale factor used with Ewald and PME.
...
@@ -229,9 +243,11 @@ private:
...
@@ -229,9 +243,11 @@ private:
/**
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
*/
f
loat
ewaldScaleFunction
(
f
loat
x
);
f
vec4
ewaldScaleFunction
(
f
vec4
x
);
};
};
}
// namespace OpenMM
// ---------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_H__
#endif // OPENMM_CPU_NONBONDED_FORCE_H__
platforms/cpu/src/CpuKernels.cpp
View file @
79fd69c3
...
@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
if
(
nonbondedMethod
!=
NoCutoff
)
{
if
(
nonbondedMethod
!=
NoCutoff
)
{
// Determine whether we need to recompute the neighbor list.
// Determine whether we need to recompute the neighbor list.
double
padding
=
0.1
*
nonbondedCutoff
;
double
padding
=
0.1
5
*
nonbondedCutoff
;
bool
needRecompute
=
false
;
bool
needRecompute
=
false
;
double
closeCutoff2
=
0.25
*
padding
*
padding
;
double
farCutoff2
=
0.5
*
padding
*
padding
;
int
maxNumMoved
=
numParticles
/
10
;
vector
<
int
>
moved
;
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
RealVec
delta
=
posData
[
i
]
-
lastPositions
[
i
];
RealVec
delta
=
posData
[
i
]
-
lastPositions
[
i
];
if
(
delta
.
dot
(
delta
)
>
0.25
*
padding
*
padding
)
{
double
dist2
=
delta
.
dot
(
delta
);
if
(
dist2
>
closeCutoff2
)
{
moved
.
push_back
(
i
);
if
(
dist2
>
farCutoff2
||
moved
.
size
()
>
maxNumMoved
)
{
needRecompute
=
true
;
needRecompute
=
true
;
break
;
break
;
}
}
}
}
}
if
(
!
needRecompute
&&
moved
.
size
()
>
0
)
{
// Some particles have moved further than half the padding distance. Look for pairs
// that are missing from the neighbor list.
int
numMoved
=
moved
.
size
();
double
cutoff2
=
nonbondedCutoff
*
nonbondedCutoff
;
for
(
int
i
=
1
;
i
<
numMoved
&&
!
needRecompute
;
i
++
)
for
(
int
j
=
0
;
j
<
i
;
j
++
)
{
RealVec
delta
=
posData
[
moved
[
i
]]
-
posData
[
moved
[
j
]];
if
(
delta
.
dot
(
delta
)
<
cutoff2
)
{
// These particles should interact. See if they are in the neighbor list.
RealVec
oldDelta
=
lastPositions
[
moved
[
i
]]
-
lastPositions
[
moved
[
j
]];
if
(
oldDelta
.
dot
(
oldDelta
)
>
cutoff2
)
{
needRecompute
=
true
;
break
;
}
}
}
}
if
(
needRecompute
)
{
if
(
needRecompute
)
{
neighborList
.
computeNeighborList
(
numParticles
,
posq
,
exclusions
,
floatBoxSize
,
periodic
||
ewald
||
pme
,
nonbondedCutoff
+
padding
);
neighborList
.
computeNeighborList
(
numParticles
,
posq
,
exclusions
,
floatBoxSize
,
periodic
||
ewald
||
pme
,
nonbondedCutoff
+
padding
);
lastPositions
=
posData
;
lastPositions
=
posData
;
}
}
nonbonded
.
setUseCutoff
(
nonbondedCutoff
,
neighborList
.
getNeighbors
()
,
rfDielectric
);
nonbonded
.
setUseCutoff
(
nonbondedCutoff
,
neighborList
,
rfDielectric
);
}
}
if
(
periodic
||
ewald
||
pme
)
{
if
(
periodic
||
ewald
||
pme
)
{
double
minAllowedSize
=
1.999999
*
nonbondedCutoff
;
double
minAllowedSize
=
1.999999
*
nonbondedCutoff
;
...
...
platforms/cpu/src/CpuNeighborList.cpp
View file @
79fd69c3
This diff is collapsed.
Click to expand it.
platforms/cpu/src/CpuNonbondedForce.cpp
View file @
79fd69c3
This diff is collapsed.
Click to expand it.
platforms/cpu/tests/TestCpuNeighborList.cpp
View file @
79fd69c3
...
@@ -39,6 +39,7 @@
...
@@ -39,6 +39,7 @@
#include "sfmt/SFMT.h"
#include "sfmt/SFMT.h"
#include <iostream>
#include <iostream>
#include <set>
#include <set>
#include <utility>
#include <vector>
#include <vector>
using
namespace
OpenMM
;
using
namespace
OpenMM
;
...
@@ -68,11 +69,20 @@ void testNeighborList(bool periodic) {
...
@@ -68,11 +69,20 @@ void testNeighborList(bool periodic) {
// Convert the neighbor list to a set for faster lookup.
// Convert the neighbor list to a set for faster lookup.
set
<
pair
<
int
,
int
>
>
neighbors
;
set
<
pair
<
int
,
int
>
>
neighbors
;
for
(
int
i
=
0
;
i
<
(
int
)
neighborList
.
getNeighbors
().
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
neighborList
.
getSortedAtoms
().
size
();
i
++
)
{
pair
<
int
,
int
>
entry
=
neighborList
.
getNeighbors
()[
i
];
int
blockIndex
=
i
/
CpuNeighborList
::
BlockSize
;
int
indexInBlock
=
i
-
blockIndex
*
CpuNeighborList
::
BlockSize
;
char
mask
=
1
<<
indexInBlock
;
for
(
int
j
=
0
;
j
<
(
int
)
neighborList
.
getBlockExclusions
(
blockIndex
).
size
();
j
++
)
{
if
((
neighborList
.
getBlockExclusions
(
blockIndex
)[
j
]
&
mask
)
==
0
)
{
int
atom1
=
neighborList
.
getSortedAtoms
()[
i
];
int
atom2
=
neighborList
.
getBlockNeighbors
(
blockIndex
)[
j
];
pair
<
int
,
int
>
entry
=
make_pair
(
min
(
atom1
,
atom2
),
max
(
atom1
,
atom2
));
ASSERT
(
neighbors
.
find
(
entry
)
==
neighbors
.
end
()
&&
neighbors
.
find
(
make_pair
(
entry
.
second
,
entry
.
first
))
==
neighbors
.
end
());
// No duplicates
ASSERT
(
neighbors
.
find
(
entry
)
==
neighbors
.
end
()
&&
neighbors
.
find
(
make_pair
(
entry
.
second
,
entry
.
first
))
==
neighbors
.
end
());
// No duplicates
neighbors
.
insert
(
entry
);
neighbors
.
insert
(
entry
);
}
}
}
}
// Check each particle pair and figure out whether they should be in the neighbor list.
// Check each particle pair and figure out whether they should be in the neighbor list.
...
@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
...
@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
if
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
>
cutoff
*
cutoff
)
if
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
>
cutoff
*
cutoff
)
shouldInclude
=
false
;
shouldInclude
=
false
;
bool
isIncluded
=
(
neighbors
.
find
(
make_pair
(
i
,
j
))
!=
neighbors
.
end
()
||
neighbors
.
find
(
make_pair
(
j
,
i
))
!=
neighbors
.
end
());
bool
isIncluded
=
(
neighbors
.
find
(
make_pair
(
i
,
j
))
!=
neighbors
.
end
()
||
neighbors
.
find
(
make_pair
(
j
,
i
))
!=
neighbors
.
end
());
ASSERT_EQUAL
(
shouldInclude
,
isIncluded
);
if
(
shouldInclude
)
ASSERT
(
isIncluded
);
}
}
}
}
...
...
plugins/cpupme/src/CpuPmeKernels.cpp
View file @
79fd69c3
...
@@ -35,9 +35,9 @@
...
@@ -35,9 +35,9 @@
#include "CpuPmeKernels.h"
#include "CpuPmeKernels.h"
#include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMRealType.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/vectorize.h"
#include <cmath>
#include <cmath>
#include <cstring>
#include <cstring>
#include <smmintrin.h>
using
namespace
OpenMM
;
using
namespace
OpenMM
;
using
namespace
std
;
using
namespace
std
;
...
@@ -47,52 +47,50 @@ static const int PME_ORDER = 5;
...
@@ -47,52 +47,50 @@ static const int PME_ORDER = 5;
bool
CpuCalcPmeReciprocalForceKernel
::
hasInitializedThreads
=
false
;
bool
CpuCalcPmeReciprocalForceKernel
::
hasInitializedThreads
=
false
;
int
CpuCalcPmeReciprocalForceKernel
::
numThreads
=
0
;
int
CpuCalcPmeReciprocalForceKernel
::
numThreads
=
0
;
#define EXTRACT_FLOAT(v, element) _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, element)))
static
void
spreadCharge
(
int
start
,
int
end
,
float
*
posq
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
static
void
spreadCharge
(
int
start
,
int
end
,
float
*
posq
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
float
temp
[
4
];
float
temp
[
4
];
__m128
boxSize
=
_mm_set_ps
(
0
,
(
float
)
periodicBoxSize
[
2
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
0
]
);
fvec4
boxSize
(
(
float
)
periodicBoxSize
[
0
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
2
],
0
);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
float
)
(
1
/
periodicBoxSize
[
2
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
0
]));
fvec4
invBoxSize
(
(
float
)
(
1
/
periodicBoxSize
[
0
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
2
])
,
0
);
__m128
gridSize
=
_mm_set_ps
(
0
,
grid
z
,
gridy
,
grid
x
);
fvec4
gridSize
(
grid
x
,
gridy
,
grid
z
,
0
);
__m128i
gridSizeInt
=
_mm_set_epi32
(
0
,
grid
z
,
gridy
,
grid
x
);
ivec4
gridSizeInt
(
grid
x
,
gridy
,
grid
z
,
0
);
__m128
one
=
_mm_set1_ps
(
1
);
fvec4
one
(
1
);
__m128
scale
=
_mm_set1_ps
(
1.0
f
/
(
PME_ORDER
-
1
));
fvec4
scale
(
1.0
f
/
(
PME_ORDER
-
1
));
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
memset
(
grid
,
0
,
sizeof
(
float
)
*
gridx
*
gridy
*
gridz
);
memset
(
grid
,
0
,
sizeof
(
float
)
*
gridx
*
gridy
*
gridz
);
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
// Find the position relative to the nearest grid point.
// Find the position relative to the nearest grid point.
__m128
pos
=
_mm_loadu_p
s
(
&
posq
[
4
*
i
]);
fvec4
po
s
(
&
posq
[
4
*
i
]);
__m128
posFloor
=
_mm_
floor
_ps
(
_mm_mul_ps
(
pos
,
invBoxSize
)
)
;
fvec4
posFloor
=
floor
(
pos
*
invBoxSize
);
__m128
posInBox
=
_mm_sub_ps
(
pos
,
_mm_mul_ps
(
boxSize
,
posFloor
))
;
fvec4
posInBox
=
pos
-
boxSize
*
posFloor
;
__m128
t
=
_mm_mul_ps
(
_mm_mul_ps
(
posInBox
,
invBoxSize
),
gridSize
)
;
fvec4
t
=
posInBox
*
invBoxSize
*
gridSize
;
__m128i
ti
=
_mm_cvttps_epi32
(
t
)
;
ivec4
ti
=
t
;
__m128
dr
=
_mm_sub_ps
(
t
,
_mm_cvtepi32_ps
(
ti
))
;
fvec4
dr
=
t
-
ti
;
__m128i
gridIndex
=
_mm_sub_epi32
(
ti
,
_mm_and_si128
(
gridSizeInt
,
_mm_cmpeq_epi32
(
ti
,
gridSizeInt
)
))
;
ivec4
gridIndex
=
ti
-
(
gridSizeInt
&
ti
==
gridSizeInt
);
// Compute the B-spline coefficients.
// Compute the B-spline coefficients.
__m128
data
[
PME_ORDER
];
fvec4
data
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
_mm_setzero_ps
()
;
data
[
PME_ORDER
-
1
]
=
0.0
f
;
data
[
1
]
=
dr
;
data
[
1
]
=
dr
;
data
[
0
]
=
_mm_sub_ps
(
one
,
dr
)
;
data
[
0
]
=
one
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
__m128
div
=
_mm_set1_ps
(
1.0
f
/
(
j
-
1
));
fvec4
div
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
dr
),
data
[
j
-
2
]
)
;
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
data
[
j
-
k
-
1
]
=
_mm_mul_ps
(
div
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
k
)),
data
[
j
-
k
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
j
-
k
)
,
dr
)
,
data
[
j
-
k
-
1
])
))
;
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
k
)
*
data
[
j
-
k
-
2
]
+
(
fvec4
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
0
]
=
div
*
(
one
-
dr
)
*
data
[
0
];
}
}
data
[
PME_ORDER
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
dr
),
data
[
PME_ORDER
-
2
]
)
;
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
_mm_mul_ps
(
scale
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
j
)),
data
[
PME_ORDER
-
j
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
PME_ORDER
-
j
)
,
dr
)
,
data
[
PME_ORDER
-
j
-
1
])
))
;
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
j
)
*
data
[
PME_ORDER
-
j
-
2
]
+
(
fvec4
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
0
]
=
scale
*
(
one
-
dr
)
*
data
[
0
];
// Spread the charges.
// Spread the charges.
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
)
;
int
gridIndexX
=
gridIndex
[
0
]
;
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
)
;
int
gridIndexY
=
gridIndex
[
1
]
;
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
)
;
int
gridIndexZ
=
gridIndex
[
2
]
;
if
(
gridIndexX
<
0
)
if
(
gridIndexX
<
0
)
return
;
// This happens when a simulation blows up and coordinates become NaN.
return
;
// This happens when a simulation blows up and coordinates become NaN.
int
zindex
[
PME_ORDER
];
int
zindex
[
PME_ORDER
];
...
@@ -101,21 +99,21 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
...
@@ -101,21 +99,21 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
}
}
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m128
zdata0to3
=
_mm_set_ps
(
EXTRACT_FLOAT
(
data
[
3
],
2
),
EXTRACT_FLOAT
(
data
[
2
],
2
),
EXTRACT_FLOAT
(
data
[
1
],
2
),
EXTRACT_FLOAT
(
data
[
0
],
2
)
);
fvec4
zdata0to3
(
data
[
0
][
2
],
data
[
1
][
2
],
data
[
2
][
2
],
data
[
3
][
2
]
);
float
zdata4
=
EXTRACT_FLOAT
(
data
[
4
]
,
2
)
;
float
zdata4
=
data
[
4
]
[
2
]
;
if
(
gridIndexZ
+
4
<
gridz
)
{
if
(
gridIndexZ
+
4
<
gridz
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
EXTRACT_FLOAT
(
data
[
ix
]
,
0
)
;
float
xdata
=
charge
*
data
[
ix
]
[
0
]
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
EXTRACT_FLOAT
(
data
[
iy
]
,
1
)
;
float
multiplier
=
xdata
*
data
[
iy
]
[
1
]
;
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
))
;
fvec4
add0to3
=
zdata0to3
*
multiplier
;
_mm_storeu_ps
(
&
grid
[
ybase
+
gridIndexZ
]
,
_mm_add_ps
(
_mm_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
])
,
add0to3
))
;
(
fvec4
(
&
grid
[
ybase
+
gridIndexZ
]
)
+
add0to3
).
store
(
&
grid
[
ybase
+
gridIndexZ
]);
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
}
}
}
}
...
@@ -125,14 +123,14 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
...
@@ -125,14 +123,14 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
int
xbase
=
gridIndexX
+
ix
;
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
EXTRACT_FLOAT
(
data
[
ix
]
,
0
)
;
float
xdata
=
charge
*
data
[
ix
]
[
0
]
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
EXTRACT_FLOAT
(
data
[
iy
]
,
1
)
;
float
multiplier
=
xdata
*
data
[
iy
]
[
1
]
;
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
))
;
fvec4
add0to3
=
zdata0to3
*
multiplier
;
_mm_
store
_ps
(
temp
,
add0to3
);
add0to3
.
store
(
temp
);
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
2
]]
+=
temp
[
2
];
grid
[
ybase
+
zindex
[
2
]]
+=
temp
[
2
];
...
@@ -245,51 +243,51 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int g
...
@@ -245,51 +243,51 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int g
}
}
static
void
interpolateForces
(
int
start
,
int
end
,
float
*
posq
,
float
*
force
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
static
void
interpolateForces
(
int
start
,
int
end
,
float
*
posq
,
float
*
force
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
__m128
boxSize
=
_mm_set_ps
(
0
,
(
float
)
periodicBoxSize
[
2
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
0
]
);
fvec4
boxSize
(
(
float
)
periodicBoxSize
[
0
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
2
],
0
);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
float
)
(
1
/
periodicBoxSize
[
2
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
0
]));
fvec4
invBoxSize
(
(
float
)
(
1
/
periodicBoxSize
[
0
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
2
])
,
0
);
__m128
gridSize
=
_mm_set_ps
(
0
,
grid
z
,
gridy
,
grid
x
);
fvec4
gridSize
(
grid
x
,
gridy
,
grid
z
,
0
);
__m128i
gridSizeInt
=
_mm_set_epi32
(
0
,
grid
z
,
gridy
,
grid
x
);
ivec4
gridSizeInt
(
grid
x
,
gridy
,
grid
z
,
0
);
__m128
one
=
_mm_set1_ps
(
1
);
fvec4
one
(
1
);
__m128
scale
=
_mm_set1_ps
(
1.0
f
/
(
PME_ORDER
-
1
));
fvec4
scale
(
1.0
f
/
(
PME_ORDER
-
1
));
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
// Find the position relative to the nearest grid point.
// Find the position relative to the nearest grid point.
__m128
pos
=
_mm_loadu_p
s
(
&
posq
[
4
*
i
]);
fvec4
po
s
(
&
posq
[
4
*
i
]);
__m128
posFloor
=
_mm_
floor
_ps
(
_mm_mul_ps
(
pos
,
invBoxSize
)
)
;
fvec4
posFloor
=
floor
(
pos
*
invBoxSize
);
__m128
posInBox
=
_mm_sub_ps
(
pos
,
_mm_mul_ps
(
boxSize
,
posFloor
))
;
fvec4
posInBox
=
pos
-
boxSize
*
posFloor
;
__m128
t
=
_mm_mul_ps
(
_mm_mul_ps
(
posInBox
,
invBoxSize
),
gridSize
)
;
fvec4
t
=
posInBox
*
invBoxSize
*
gridSize
;
__m128i
ti
=
_mm_cvttps_epi32
(
t
)
;
ivec4
ti
=
t
;
__m128
dr
=
_mm_sub_ps
(
t
,
_mm_cvtepi32_ps
(
ti
))
;
fvec4
dr
=
t
-
ti
;
__m128i
gridIndex
=
_mm_sub_epi32
(
ti
,
_mm_and_si128
(
gridSizeInt
,
_mm_cmpeq_epi32
(
ti
,
gridSizeInt
)
))
;
ivec4
gridIndex
=
ti
-
(
gridSizeInt
&
ti
==
gridSizeInt
);
// Compute the B-spline coefficients.
// Compute the B-spline coefficients.
__m128
data
[
PME_ORDER
];
fvec4
data
[
PME_ORDER
];
__m128
ddata
[
PME_ORDER
];
fvec4
ddata
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
_mm_setzero_ps
()
;
data
[
PME_ORDER
-
1
]
=
0.0
f
;
data
[
1
]
=
dr
;
data
[
1
]
=
dr
;
data
[
0
]
=
_mm_sub_ps
(
one
,
dr
)
;
data
[
0
]
=
one
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
__m128
div
=
_mm_set1_ps
(
1.0
f
/
(
j
-
1
));
fvec4
div
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
dr
),
data
[
j
-
2
]
)
;
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
data
[
j
-
k
-
1
]
=
_mm_mul_ps
(
div
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
k
)),
data
[
j
-
k
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
j
-
k
)
,
dr
)
,
data
[
j
-
k
-
1
])
))
;
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
k
)
*
data
[
j
-
k
-
2
]
+
(
fvec4
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
0
]
=
div
*
(
one
-
dr
)
*
data
[
0
];
}
}
ddata
[
0
]
=
_mm_sub_ps
(
_mm_set1_ps
(
0
),
data
[
0
]
)
;
ddata
[
0
]
=
-
data
[
0
];
for
(
int
j
=
1
;
j
<
PME_ORDER
;
j
++
)
for
(
int
j
=
1
;
j
<
PME_ORDER
;
j
++
)
ddata
[
j
]
=
_mm_sub_ps
(
data
[
j
-
1
]
,
data
[
j
]
)
;
ddata
[
j
]
=
data
[
j
-
1
]
-
data
[
j
];
data
[
PME_ORDER
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
dr
),
data
[
PME_ORDER
-
2
]
)
;
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
_mm_mul_ps
(
scale
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
j
)),
data
[
PME_ORDER
-
j
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
PME_ORDER
-
j
)
,
dr
)
,
data
[
PME_ORDER
-
j
-
1
])
))
;
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
j
)
*
data
[
PME_ORDER
-
j
-
2
]
+
(
fvec4
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
0
]
=
scale
*
(
one
-
dr
)
*
data
[
0
];
// Compute the force on this atom.
// Compute the force on this atom.
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
)
;
int
gridIndexX
=
gridIndex
[
0
]
;
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
)
;
int
gridIndexY
=
gridIndex
[
1
]
;
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
)
;
int
gridIndexZ
=
gridIndex
[
2
]
;
if
(
gridIndexX
<
0
)
if
(
gridIndexX
<
0
)
return
;
// This happens when a simulation blows up and coordinates become NaN.
return
;
// This happens when a simulation blows up and coordinates become NaN.
int
zindex
[
PME_ORDER
];
int
zindex
[
PME_ORDER
];
...
@@ -297,34 +295,34 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
...
@@ -297,34 +295,34 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
zindex
[
j
]
=
gridIndexZ
+
j
;
zindex
[
j
]
=
gridIndexZ
+
j
;
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
}
}
__m128
zdata
[
PME_ORDER
];
fvec4
zdata
[
PME_ORDER
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
zdata
[
j
]
=
_mm_set_ps
(
0
,
EXTRACT_FLOAT
(
ddata
[
j
],
2
),
EXTRACT_FLOAT
(
data
[
j
],
2
),
EXTRACT_FLOAT
(
data
[
j
]
,
2
)
);
zdata
[
j
]
=
fvec4
(
data
[
j
][
2
],
data
[
j
][
2
],
d
data
[
j
]
[
2
],
0
);
__m128
f
=
_mm_set1_ps
(
0
)
;
fvec4
f
=
0.0
f
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
xbase
=
xbase
*
gridy
*
gridz
;
float
dx
=
EXTRACT_FLOAT
(
data
[
ix
]
,
0
)
;
float
dx
=
data
[
ix
]
[
0
]
;
float
ddx
=
EXTRACT_FLOAT
(
ddata
[
ix
]
,
0
)
;
float
ddx
=
ddata
[
ix
]
[
0
]
;
__m128
xdata
=
_mm_set_ps
(
0
,
dx
,
dx
,
ddx
);
fvec4
xdata
(
ddx
,
dx
,
dx
,
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
ybase
=
xbase
+
ybase
*
gridz
;
float
dy
=
EXTRACT_FLOAT
(
data
[
iy
]
,
1
)
;
float
dy
=
data
[
iy
]
[
1
]
;
float
ddy
=
EXTRACT_FLOAT
(
ddata
[
iy
]
,
1
)
;
float
ddy
=
ddata
[
iy
]
[
1
]
;
__m128
xydata
=
_mm_mul_ps
(
xdata
,
_mm_set_ps
(
0
,
dy
,
ddy
,
dy
)
);
fvec4
xydata
=
xdata
*
fvec4
(
dy
,
ddy
,
dy
,
0
);
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
__m128
gridValue
=
_mm_set1_ps
(
grid
[
ybase
+
zindex
[
iz
]]);
fvec4
gridValue
(
grid
[
ybase
+
zindex
[
iz
]]);
f
=
_mm_add_ps
(
f
,
_mm_mul_ps
(
xydata
,
_mm_mul_ps
(
zdata
[
iz
]
,
gridValue
)))
;
f
=
f
+
xydata
*
zdata
[
iz
]
*
gridValue
;
}
}
}
}
}
}
f
=
_mm_mul_ps
(
invBoxSize
,
_mm_mul_ps
(
gridSize
,
_mm_mul_ps
(
f
,
_mm_set1_ps
(
-
epsilonFactor
*
posq
[
4
*
i
+
3
])
)))
;
f
=
invBoxSize
*
gridSize
*
f
*
(
-
epsilonFactor
*
posq
[
4
*
i
+
3
]);
_mm_
store
_ps
(
&
force
[
4
*
i
]
,
f
);
f
.
store
(
&
force
[
4
*
i
]
);
}
}
}
}
...
@@ -509,10 +507,10 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
...
@@ -509,10 +507,10 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
threadWait
();
threadWait
();
int
numGrids
=
threadData
.
size
();
int
numGrids
=
threadData
.
size
();
for
(
int
i
=
gridStart
;
i
<
gridEnd
;
i
+=
4
)
{
for
(
int
i
=
gridStart
;
i
<
gridEnd
;
i
+=
4
)
{
__m128
sum
=
_mm_load_ps
(
&
realGrid
[
i
]);
fvec4
sum
(
&
realGrid
[
i
]);
for
(
int
j
=
1
;
j
<
numGrids
;
j
++
)
for
(
int
j
=
1
;
j
<
numGrids
;
j
++
)
sum
=
_mm_add_ps
(
sum
,
_mm_load_ps
(
&
threadData
[
j
]
->
tempGrid
[
i
])
)
;
sum
+
=
fvec4
(
&
threadData
[
j
]
->
tempGrid
[
i
]);
_mm_
store
_ps
(
&
realGrid
[
i
]
,
sum
);
sum
.
store
(
&
realGrid
[
i
]);
}
}
threadWait
();
threadWait
();
if
(
lastBoxSize
!=
periodicBoxSize
)
{
if
(
lastBoxSize
!=
periodicBoxSize
)
{
...
...
wrappers/python/src/swig_doxygen/swigInputConfig.py
View file @
79fd69c3
...
@@ -145,6 +145,8 @@ SKIP_METHODS = [('State',),
...
@@ -145,6 +145,8 @@ SKIP_METHODS = [('State',),
(
'IntegrateDrudeSCFStepKernel'
,),
(
'IntegrateDrudeSCFStepKernel'
,),
(
'XmlSerializer'
,
'serialize'
),
(
'XmlSerializer'
,
'serialize'
),
(
'XmlSerializer'
,
'deserialize'
),
(
'XmlSerializer'
,
'deserialize'
),
(
'fvec4'
,),
(
'ivec4'
,),
]
]
# The build script assumes method args that are non-const references are
# The build script assumes method args that are non-const references are
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment