Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
79fd69c3
Commit
79fd69c3
authored
Oct 23, 2013
by
peastman
Browse files
Merge pull request #182 from peastman/master
Lots of optimizations to CPU platform
parents
b5da7380
062a2db0
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1029 additions
and
350 deletions
+1029
-350
CMakeLists.txt
CMakeLists.txt
+1
-0
libraries/sfmt/include/sfmt/SFMT-sse2.h
libraries/sfmt/include/sfmt/SFMT-sse2.h
+16
-14
libraries/sfmt/src/SFMT.cpp
libraries/sfmt/src/SFMT.cpp
+2
-2
openmmapi/include/openmm/internal/vectorize.h
openmmapi/include/openmm/internal/vectorize.h
+248
-0
platforms/cpu/include/CpuNeighborList.h
platforms/cpu/include/CpuNeighborList.h
+21
-5
platforms/cpu/include/CpuNonbondedForce.h
platforms/cpu/include/CpuNonbondedForce.h
+30
-14
platforms/cpu/src/CpuKernels.cpp
platforms/cpu/src/CpuKernels.cpp
+33
-5
platforms/cpu/src/CpuNeighborList.cpp
platforms/cpu/src/CpuNeighborList.cpp
+325
-118
platforms/cpu/src/CpuNonbondedForce.cpp
platforms/cpu/src/CpuNonbondedForce.cpp
+253
-103
platforms/cpu/tests/TestCpuNeighborList.cpp
platforms/cpu/tests/TestCpuNeighborList.cpp
+16
-5
plugins/cpupme/src/CpuPmeKernels.cpp
plugins/cpupme/src/CpuPmeKernels.cpp
+82
-84
wrappers/python/src/swig_doxygen/swigInputConfig.py
wrappers/python/src/swig_doxygen/swigInputConfig.py
+2
-0
No files found.
CMakeLists.txt
View file @
79fd69c3
...
...
@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
## OpenMM was previously installed there.
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
ENDFOREACH
(
subdir
)
SET_SOURCE_FILES_PROPERTIES
(
${
CMAKE_SOURCE_DIR
}
/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS
"-msse2 -DHAVE_SSE2=1"
)
# If API wrappers are being generated, and add them to the build.
FIND_PROGRAM
(
GCCXML_PATH gccxml PATH
...
...
libraries/sfmt/include/sfmt/SFMT-sse2.h
View file @
79fd69c3
...
...
@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
* This function fills the internal state array with pseudorandom
* integers.
*/
inline
static
void
gen_rand_all
(
void
)
{
inline
static
void
gen_rand_all
(
SFMT
&
sfmt
)
{
int
i
;
__m128i
r
,
r1
,
r2
,
mask
;
mask
=
_mm_set_epi32
(
MSK4
,
MSK3
,
MSK2
,
MSK1
);
r1
=
_mm_load_si128
(
&
sfmt
[
N
-
2
].
si
);
r2
=
_mm_load_si128
(
&
sfmt
[
N
-
1
].
si
);
SFMTData
&
data
=
*
sfmt
.
data
;
r1
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
2
].
si
);
r2
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
1
].
si
);
for
(
i
=
0
;
i
<
N
-
POS1
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
sfmt
[
i
].
si
,
r
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
data
.
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
data
.
sfmt
[
i
].
si
,
r
);
r1
=
r2
;
r2
=
r
;
}
for
(;
i
<
N
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
sfmt
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
sfmt
[
i
].
si
,
r
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
data
.
sfmt
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
data
.
sfmt
[
i
].
si
,
r
);
r1
=
r2
;
r2
=
r
;
}
...
...
@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) {
* @param array an 128-bit array to be filled by pseudorandom numbers.
* @param size number of 128-bit pesudorandom numbers to be generated.
*/
inline
static
void
gen_rand_array
(
w128_t
*
array
,
int
size
)
{
inline
static
void
gen_rand_array
(
w128_t
*
array
,
int
size
,
SFMT
&
sfmt
)
{
int
i
,
j
;
__m128i
r
,
r1
,
r2
,
mask
;
mask
=
_mm_set_epi32
(
MSK4
,
MSK3
,
MSK2
,
MSK1
);
r1
=
_mm_load_si128
(
&
sfmt
[
N
-
2
].
si
);
r2
=
_mm_load_si128
(
&
sfmt
[
N
-
1
].
si
);
SFMTData
&
data
=
*
sfmt
.
data
;
r1
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
2
].
si
);
r2
=
_mm_load_si128
(
&
data
.
sfmt
[
N
-
1
].
si
);
for
(
i
=
0
;
i
<
N
-
POS1
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
data
.
sfmt
[
i
+
POS1
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
r1
=
r2
;
r2
=
r
;
}
for
(;
i
<
N
;
i
++
)
{
r
=
mm_recursion
(
&
sfmt
[
i
].
si
,
&
array
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
r
=
mm_recursion
(
&
data
.
sfmt
[
i
].
si
,
&
array
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
r1
=
r2
;
r2
=
r
;
...
...
@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) {
}
for
(
j
=
0
;
j
<
2
*
N
-
size
;
j
++
)
{
r
=
_mm_load_si128
(
&
array
[
j
+
size
-
N
].
si
);
_mm_store_si128
(
&
sfmt
[
j
].
si
,
r
);
_mm_store_si128
(
&
data
.
sfmt
[
j
].
si
,
r
);
}
for
(;
i
<
size
;
i
++
)
{
r
=
mm_recursion
(
&
array
[
i
-
N
].
si
,
&
array
[
i
+
POS1
-
N
].
si
,
r1
,
r2
,
mask
);
_mm_store_si128
(
&
array
[
i
].
si
,
r
);
_mm_store_si128
(
&
sfmt
[
j
++
].
si
,
r
);
_mm_store_si128
(
&
data
.
sfmt
[
j
++
].
si
,
r
);
r1
=
r2
;
r2
=
r
;
}
...
...
libraries/sfmt/src/SFMT.cpp
View file @
79fd69c3
...
...
@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size);
#endif
#if defined(HAVE_ALTIVEC)
#include "SFMT-alti.h"
#include "
sfmt/
SFMT-alti.h"
#elif defined(HAVE_SSE2)
#include "SFMT-sse2.h"
#include "
sfmt/
SFMT-sse2.h"
#endif
/**
...
...
openmmapi/include/openmm/internal/vectorize.h
0 → 100644
View file @
79fd69c3
#ifndef OPENMM_VECTORIZE_H_
#define OPENMM_VECTORIZE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class
ivec4
;
/**
* A four element vector of floats.
*/
class
fvec4
{
public:
__m128
val
;
fvec4
()
{}
fvec4
(
float
v
)
:
val
(
_mm_set1_ps
(
v
))
{}
fvec4
(
float
v1
,
float
v2
,
float
v3
,
float
v4
)
:
val
(
_mm_set_ps
(
v4
,
v3
,
v2
,
v1
))
{}
fvec4
(
__m128
v
)
:
val
(
v
)
{}
fvec4
(
const
float
*
v
)
:
val
(
_mm_loadu_ps
(
v
))
{}
operator
__m128
()
const
{
return
val
;
}
float
operator
[](
int
i
)
const
{
int
resultBits
=
_mm_extract_ps
(
val
,
i
);
return
*
((
float
*
)
&
resultBits
);
}
void
store
(
float
*
v
)
const
{
_mm_storeu_ps
(
v
,
val
);
}
fvec4
operator
+
(
fvec4
other
)
const
{
return
_mm_add_ps
(
val
,
other
);
}
fvec4
operator
-
(
fvec4
other
)
const
{
return
_mm_sub_ps
(
val
,
other
);
}
fvec4
operator
*
(
fvec4
other
)
const
{
return
_mm_mul_ps
(
val
,
other
);
}
fvec4
operator
/
(
fvec4
other
)
const
{
return
_mm_div_ps
(
val
,
other
);
}
void
operator
+=
(
fvec4
other
)
{
val
=
_mm_add_ps
(
val
,
other
);
}
void
operator
-=
(
fvec4
other
)
{
val
=
_mm_sub_ps
(
val
,
other
);
}
void
operator
*=
(
fvec4
other
)
{
val
=
_mm_mul_ps
(
val
,
other
);
}
void
operator
/=
(
fvec4
other
)
{
val
=
_mm_div_ps
(
val
,
other
);
}
fvec4
operator
-
()
const
{
return
_mm_sub_ps
(
_mm_set1_ps
(
0.0
f
),
val
);
}
fvec4
operator
&
(
fvec4
other
)
const
{
return
_mm_and_ps
(
val
,
other
);
}
fvec4
operator
==
(
fvec4
other
)
const
{
return
_mm_cmpeq_ps
(
val
,
other
);
}
fvec4
operator
!=
(
fvec4
other
)
const
{
return
_mm_cmpneq_ps
(
val
,
other
);
}
fvec4
operator
>
(
fvec4
other
)
const
{
return
_mm_cmpgt_ps
(
val
,
other
);
}
fvec4
operator
<
(
fvec4
other
)
const
{
return
_mm_cmplt_ps
(
val
,
other
);
}
fvec4
operator
>=
(
fvec4
other
)
const
{
return
_mm_cmpge_ps
(
val
,
other
);
}
fvec4
operator
<=
(
fvec4
other
)
const
{
return
_mm_cmple_ps
(
val
,
other
);
}
operator
ivec4
()
const
;
};
/**
* A four element vector of ints.
*/
class
ivec4
{
public:
__m128i
val
;
ivec4
()
{}
ivec4
(
int
v
)
:
val
(
_mm_set1_epi32
(
v
))
{}
ivec4
(
int
v1
,
int
v2
,
int
v3
,
int
v4
)
:
val
(
_mm_set_epi32
(
v4
,
v3
,
v2
,
v1
))
{}
ivec4
(
__m128i
v
)
:
val
(
v
)
{}
ivec4
(
const
int
*
v
)
:
val
(
_mm_loadu_si128
((
const
__m128i
*
)
v
))
{}
operator
__m128i
()
const
{
return
val
;
}
int
operator
[](
int
i
)
const
{
return
_mm_extract_epi32
(
val
,
i
);
}
void
store
(
int
*
v
)
const
{
_mm_storeu_si128
((
__m128i
*
)
v
,
val
);
}
ivec4
operator
+
(
ivec4
other
)
const
{
return
_mm_add_epi32
(
val
,
other
);
}
ivec4
operator
-
(
ivec4
other
)
const
{
return
_mm_sub_epi32
(
val
,
other
);
}
ivec4
operator
*
(
ivec4
other
)
const
{
return
_mm_mul_epi32
(
val
,
other
);
}
void
operator
+=
(
ivec4
other
)
{
val
=
_mm_add_epi32
(
val
,
other
);
}
void
operator
-=
(
ivec4
other
)
{
val
=
_mm_sub_epi32
(
val
,
other
);
}
void
operator
*=
(
ivec4
other
)
{
val
=
_mm_mul_epi32
(
val
,
other
);
}
ivec4
operator
-
()
const
{
return
_mm_sub_epi32
(
_mm_set1_epi32
(
0
),
val
);
}
ivec4
operator
&
(
ivec4
other
)
const
{
return
_mm_and_si128
(
val
,
other
);
}
ivec4
operator
==
(
ivec4
other
)
const
{
return
_mm_cmpeq_epi32
(
val
,
other
);
}
operator
fvec4
()
const
;
};
// Conversion operators.
inline
fvec4
::
operator
ivec4
()
const
{
return
_mm_cvttps_epi32
(
val
);
}
inline
ivec4
::
operator
fvec4
()
const
{
return
_mm_cvtepi32_ps
(
val
);
}
// Functions that operate on fvec4s.
static
inline
fvec4
floor
(
fvec4
v
)
{
return
fvec4
(
_mm_floor_ps
(
v
.
val
));
}
static
inline
fvec4
ceil
(
fvec4
v
)
{
return
fvec4
(
_mm_ceil_ps
(
v
.
val
));
}
static
inline
fvec4
round
(
fvec4
v
)
{
return
fvec4
(
_mm_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
}
static
inline
fvec4
min
(
fvec4
v1
,
fvec4
v2
)
{
return
fvec4
(
_mm_min_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
max
(
fvec4
v1
,
fvec4
v2
)
{
return
fvec4
(
_mm_max_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
abs
(
fvec4
v
)
{
static
const
__m128
mask
=
_mm_castsi128_ps
(
_mm_set1_epi32
(
0x7FFFFFFF
));
return
fvec4
(
_mm_and_ps
(
v
.
val
,
mask
));
}
static
inline
fvec4
sqrt
(
fvec4
v
)
{
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
}
static
inline
float
dot3
(
fvec4
v1
,
fvec4
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
}
static
inline
float
dot4
(
fvec4
v1
,
fvec4
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0xF1
));
}
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
_mm_min_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
max
(
ivec4
v1
,
ivec4
v2
)
{
return
ivec4
(
_mm_max_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
abs
(
ivec4
v
)
{
return
ivec4
(
_mm_abs_epi32
(
v
.
val
));
}
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
+
v2
;
}
static
inline
fvec4
operator
-
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
-
v2
;
}
static
inline
fvec4
operator
*
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
*
v2
;
}
static
inline
fvec4
operator
/
(
float
v1
,
fvec4
v2
)
{
return
fvec4
(
v1
)
/
v2
;
}
#endif
/*OPENMM_VECTORIZE_H_*/
platforms/cpu/include/CpuNeighborList.h
View file @
79fd69c3
...
...
@@ -12,26 +12,42 @@ namespace OpenMM {
class
OPENMM_EXPORT_CPU
CpuNeighborList
{
public:
class
ThreadData
;
class
VoxelHash
;
class
Voxels
;
static
const
int
BlockSize
;
CpuNeighborList
();
~
CpuNeighborList
();
void
computeNeighborList
(
int
numAtoms
,
const
std
::
vector
<
float
>&
atomLocations
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
);
const
std
::
vector
<
std
::
pair
<
int
,
int
>
>&
getNeighbors
();
int
getNumBlocks
()
const
;
const
std
::
vector
<
int
>&
getSortedAtoms
()
const
;
const
std
::
vector
<
int
>&
getBlockNeighbors
(
int
blockIndex
)
const
;
const
std
::
vector
<
char
>&
getBlockExclusions
(
int
blockIndex
)
const
;
/**
* This routine contains the code executed by each thread.
*/
void
runThread
(
int
index
,
std
::
vector
<
std
::
pair
<
int
,
int
>
>&
threadNeighbors
);
void
runThread
(
int
index
);
private:
/**
* This is called by the worker threads to wait until the master thread instructs them to advance.
*/
void
threadWait
();
/**
* This is called by the master thread to instruct all the worker threads to advance.
*/
void
advanceThreads
();
bool
isDeleted
;
int
numThreads
,
waitCount
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
neighbors
;
std
::
vector
<
int
>
sortedAtoms
;
std
::
vector
<
std
::
vector
<
int
>
>
blockNeighbors
;
std
::
vector
<
std
::
vector
<
char
>
>
blockExclusions
;
std
::
vector
<
pthread_t
>
thread
;
std
::
vector
<
ThreadData
*>
threadData
;
pthread_cond_t
startCondition
,
endCondition
;
pthread_mutex_t
lock
;
// The following variables are used to make information accessible to the individual threads.
VoxelHash
*
voxelHash
;
float
minx
,
maxx
,
miny
,
maxy
,
minz
,
maxz
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
atomBins
;
Voxels
*
voxels
;
const
std
::
vector
<
std
::
set
<
int
>
>*
exclusions
;
const
float
*
atomLocations
;
const
float
*
periodicBoxSize
;
...
...
platforms/cpu/include/CpuNonbondedForce.h
View file @
79fd69c3
...
...
@@ -25,14 +25,17 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/vectorize.h"
#include <pthread.h>
#include <set>
#include <utility>
#include <vector>
#include <smmintrin.h>
// ---------------------------------------------------------------------------------------
namespace
OpenMM
{
class
CpuNonbondedForce
{
public:
class
ThreadData
;
...
...
@@ -63,7 +66,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void
setUseCutoff
(
float
distance
,
const
std
::
vector
<
std
::
pair
<
int
,
int
>
>
&
neighbors
,
float
solventDielectric
);
void
setUseCutoff
(
float
distance
,
const
CpuNeighborList
&
neighbors
,
float
solventDielectric
);
/**---------------------------------------------------------------------------------------
...
...
@@ -127,9 +130,9 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void
calculateReciprocalIxn
(
int
numberOfAtoms
,
float
*
posq
,
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
void
calculateReciprocalIxn
(
int
numberOfAtoms
,
float
*
posq
,
std
::
vector
<
RealVec
>&
atomCoordinates
,
const
std
::
vector
<
std
::
pair
<
float
,
float
>
>&
atomParameters
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
std
::
vector
<
OpenMM
::
RealVec
>&
forces
,
float
*
totalEnergy
)
const
;
std
::
vector
<
RealVec
>&
forces
,
float
*
totalEnergy
)
const
;
/**---------------------------------------------------------------------------------------
...
...
@@ -159,14 +162,14 @@ private:
bool
periodic
;
bool
ewald
;
bool
pme
;
const
std
::
vector
<
std
::
pair
<
int
,
int
>
>
*
neighborList
;
const
CpuNeighborList
*
neighborList
;
float
periodicBoxSize
[
3
];
float
cutoffDistance
,
switchingDistance
;
float
krf
,
crf
;
float
alphaEwald
;
int
numRx
,
numRy
,
numRz
;
int
meshDim
[
3
];
std
::
vector
<
float
>
ewaldScale
X
,
ewaldScaleY
,
ewaldScaleDeriv
;
std
::
vector
<
float
>
ewaldScale
Table
;
float
ewaldDX
,
ewaldDXInv
;
bool
isDeleted
;
int
numThreads
,
waitCount
;
...
...
@@ -195,31 +198,42 @@ private:
--------------------------------------------------------------------------------------- */
void
calculateOneIxn
(
int
atom1
,
int
atom2
,
float
*
forces
,
double
*
totalEnergy
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
);
void
calculateOneIxn
(
int
atom1
,
int
atom2
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
/**---------------------------------------------------------------------------------------
Calculate
LJ Coulomb pair ixn between two atoms
Calculate
all the interactions for one atom block.
@param atom1 the index of the first atom
@param atom2 the index of the second atom
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void
calculateBlockIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void
calculate
One
EwaldIxn
(
int
atom1
,
int
atom2
,
float
*
forces
,
double
*
totalEnergy
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
);
void
calculate
Block
EwaldIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
/**
* Compute the displacement and squared distance between two points, optionally using
* periodic boundary conditions.
*/
void
getDeltaR
(
const
__m128
&
posI
,
const
__m128
&
posJ
,
__m128
&
deltaR
,
float
&
r2
,
bool
periodic
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
)
const
;
void
getDeltaR
(
const
fvec4
&
posI
,
const
fvec4
&
posJ
,
fvec4
&
deltaR
,
float
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
;
/**
* Compute a fast approximation to erfc(x).
*/
static
f
loat
erfcApprox
(
f
loat
x
);
static
f
vec4
erfcApprox
(
f
vec4
x
);
/**
* Create a lookup table for the scale factor used with Ewald and PME.
...
...
@@ -229,9 +243,11 @@ private:
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
f
loat
ewaldScaleFunction
(
f
loat
x
);
f
vec4
ewaldScaleFunction
(
f
vec4
x
);
};
}
// namespace OpenMM
// ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_H__
platforms/cpu/src/CpuKernels.cpp
View file @
79fd69c3
...
...
@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
if
(
nonbondedMethod
!=
NoCutoff
)
{
// Determine whether we need to recompute the neighbor list.
double
padding
=
0.1
*
nonbondedCutoff
;
double
padding
=
0.1
5
*
nonbondedCutoff
;
bool
needRecompute
=
false
;
double
closeCutoff2
=
0.25
*
padding
*
padding
;
double
farCutoff2
=
0.5
*
padding
*
padding
;
int
maxNumMoved
=
numParticles
/
10
;
vector
<
int
>
moved
;
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
RealVec
delta
=
posData
[
i
]
-
lastPositions
[
i
];
if
(
delta
.
dot
(
delta
)
>
0.25
*
padding
*
padding
)
{
needRecompute
=
true
;
break
;
double
dist2
=
delta
.
dot
(
delta
);
if
(
dist2
>
closeCutoff2
)
{
moved
.
push_back
(
i
);
if
(
dist2
>
farCutoff2
||
moved
.
size
()
>
maxNumMoved
)
{
needRecompute
=
true
;
break
;
}
}
}
if
(
!
needRecompute
&&
moved
.
size
()
>
0
)
{
// Some particles have moved further than half the padding distance. Look for pairs
// that are missing from the neighbor list.
int
numMoved
=
moved
.
size
();
double
cutoff2
=
nonbondedCutoff
*
nonbondedCutoff
;
for
(
int
i
=
1
;
i
<
numMoved
&&
!
needRecompute
;
i
++
)
for
(
int
j
=
0
;
j
<
i
;
j
++
)
{
RealVec
delta
=
posData
[
moved
[
i
]]
-
posData
[
moved
[
j
]];
if
(
delta
.
dot
(
delta
)
<
cutoff2
)
{
// These particles should interact. See if they are in the neighbor list.
RealVec
oldDelta
=
lastPositions
[
moved
[
i
]]
-
lastPositions
[
moved
[
j
]];
if
(
oldDelta
.
dot
(
oldDelta
)
>
cutoff2
)
{
needRecompute
=
true
;
break
;
}
}
}
}
if
(
needRecompute
)
{
neighborList
.
computeNeighborList
(
numParticles
,
posq
,
exclusions
,
floatBoxSize
,
periodic
||
ewald
||
pme
,
nonbondedCutoff
+
padding
);
lastPositions
=
posData
;
}
nonbonded
.
setUseCutoff
(
nonbondedCutoff
,
neighborList
.
getNeighbors
()
,
rfDielectric
);
nonbonded
.
setUseCutoff
(
nonbondedCutoff
,
neighborList
,
rfDielectric
);
}
if
(
periodic
||
ewald
||
pme
)
{
double
minAllowedSize
=
1.999999
*
nonbondedCutoff
;
...
...
platforms/cpu/src/CpuNeighborList.cpp
View file @
79fd69c3
#include "CpuNeighborList.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/vectorize.h"
#include "hilbert.h"
#include <algorithm>
#include <set>
#include <map>
#include <cmath>
...
...
@@ -9,137 +12,236 @@ using namespace std;
namespace
OpenMM
{
const
int
CpuNeighborList
::
BlockSize
=
4
;
class
VoxelIndex
{
public:
VoxelIndex
(
int
xx
,
int
yy
,
int
zz
)
:
x
(
xx
),
y
(
yy
),
z
(
zz
)
{
}
// operator<() needed for map
bool
operator
<
(
const
VoxelIndex
&
other
)
const
{
if
(
x
<
other
.
x
)
return
true
;
else
if
(
x
>
other
.
x
)
return
false
;
else
if
(
y
<
other
.
y
)
return
true
;
else
if
(
y
>
other
.
y
)
return
false
;
else
if
(
z
<
other
.
z
)
return
true
;
else
return
false
;
VoxelIndex
(
int
x
,
int
y
)
:
x
(
x
),
y
(
y
)
{
}
int
x
;
int
y
;
int
z
;
};
typedef
pair
<
const
float
*
,
int
>
VoxelItem
;
typedef
vector
<
VoxelItem
>
Voxel
;
class
CpuNeighborList
::
VoxelHash
{
/**
* This data structure organizes the particles spatially. It divides them into bins along the x and y axes,
* then sorts each bin along the z axis so ranges can be identified quickly with a binary search.
*/
class
CpuNeighborList
::
Voxels
{
public:
Voxel
Hash
(
float
vsx
,
float
vsy
,
float
vsz
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
)
:
voxelSizeX
(
vsx
),
voxelSizeY
(
vsy
),
voxelSizeZ
(
vsz
),
periodicBoxSize
(
periodicBoxSize
),
usePeriodic
(
usePeriodic
)
{
Voxel
s
(
float
vsx
,
float
vsy
,
float
minx
,
float
maxx
,
float
miny
,
float
maxy
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
)
:
voxelSizeX
(
vsx
),
voxelSizeY
(
vsy
),
minx
(
minx
),
maxx
(
maxx
),
miny
(
miny
),
maxy
(
maxy
),
periodicBoxSize
(
periodicBoxSize
),
usePeriodic
(
usePeriodic
)
{
if
(
usePeriodic
)
{
nx
=
(
int
)
floorf
(
periodicBoxSize
[
0
]
/
voxelSizeX
+
0.5
f
);
ny
=
(
int
)
floorf
(
periodicBoxSize
[
1
]
/
voxelSizeY
+
0.5
f
);
nz
=
(
int
)
floorf
(
periodicBoxSize
[
2
]
/
voxelSizeZ
+
0.5
f
);
voxelSizeX
=
periodicBoxSize
[
0
]
/
nx
;
voxelSizeY
=
periodicBoxSize
[
1
]
/
ny
;
voxelSizeZ
=
periodicBoxSize
[
2
]
/
nz
;
}
else
{
nx
=
max
(
1
,
(
int
)
floorf
((
maxx
-
minx
)
/
voxelSizeX
+
0.5
f
));
ny
=
max
(
1
,
(
int
)
floorf
((
maxy
-
miny
)
/
voxelSizeY
+
0.5
f
));
if
(
maxx
>
minx
)
voxelSizeX
=
(
maxx
-
minx
)
/
nx
;
if
(
maxy
>
miny
)
voxelSizeY
=
(
maxy
-
miny
)
/
ny
;
}
bins
.
resize
(
nx
);
for
(
int
i
=
0
;
i
<
nx
;
i
++
)
{
bins
[
i
].
resize
(
ny
);
for
(
int
j
=
0
;
j
<
ny
;
j
++
)
bins
[
i
][
j
].
resize
(
0
);
}
}
void
insert
(
const
int
&
item
,
const
float
*
location
)
{
/**
* Insert a particle into the voxel data structure.
*/
void
insert
(
const
int
&
atom
,
const
float
*
location
)
{
VoxelIndex
voxelIndex
=
getVoxelIndex
(
location
);
if
(
voxelMap
.
find
(
voxelIndex
)
==
voxelMap
.
end
())
voxelMap
[
voxelIndex
]
=
Voxel
();
Voxel
&
voxel
=
voxelMap
.
find
(
voxelIndex
)
->
second
;
voxel
.
push_back
(
VoxelItem
(
location
,
item
));
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
].
push_back
(
make_pair
(
location
[
2
],
VoxelItem
(
location
,
atom
)));
}
/**
* Sort the particles in each voxel by z coordinate.
*/
void
sortItems
()
{
for
(
int
i
=
0
;
i
<
nx
;
i
++
)
for
(
int
j
=
0
;
j
<
ny
;
j
++
)
sort
(
bins
[
i
][
j
].
begin
(),
bins
[
i
][
j
].
end
());
}
/**
* Find the index of the first particle in voxel (x,y) whose z coordinate in >= the specified value.
*/
int
findLowerBound
(
int
x
,
int
y
,
double
z
)
const
{
const
vector
<
pair
<
float
,
VoxelItem
>
>&
bin
=
bins
[
x
][
y
];
int
lower
=
0
;
int
upper
=
bin
.
size
();
while
(
lower
<
upper
)
{
int
middle
=
(
lower
+
upper
)
/
2
;
if
(
bin
[
middle
].
first
<
z
)
lower
=
middle
+
1
;
else
upper
=
middle
;
}
return
lower
;
}
/**
* Find the index of the first particle in voxel (x,y) whose z coordinate in greater than the specified value.
*/
int
findUpperBound
(
int
x
,
int
y
,
double
z
)
const
{
const
vector
<
pair
<
float
,
VoxelItem
>
>&
bin
=
bins
[
x
][
y
];
int
lower
=
0
;
int
upper
=
bin
.
size
();
while
(
lower
<
upper
)
{
int
middle
=
(
lower
+
upper
)
/
2
;
if
(
bin
[
middle
].
first
>
z
)
upper
=
middle
;
else
lower
=
middle
+
1
;
}
return
upper
;
}
/**
* Get the voxel index containing a particular location.
*/
VoxelIndex
getVoxelIndex
(
const
float
*
location
)
const
{
float
xperiodic
,
yperiodic
,
zperiodic
;
float
xperiodic
,
yperiodic
;
if
(
!
usePeriodic
)
{
xperiodic
=
location
[
0
];
yperiodic
=
location
[
1
];
zperiodic
=
location
[
2
];
xperiodic
=
location
[
0
]
-
minx
;
yperiodic
=
location
[
1
]
-
miny
;
}
else
{
xperiodic
=
location
[
0
]
-
periodicBoxSize
[
0
]
*
floorf
(
location
[
0
]
/
periodicBoxSize
[
0
]);
yperiodic
=
location
[
1
]
-
periodicBoxSize
[
1
]
*
floorf
(
location
[
1
]
/
periodicBoxSize
[
1
]);
zperiodic
=
location
[
2
]
-
periodicBoxSize
[
2
]
*
floorf
(
location
[
2
]
/
periodicBoxSize
[
2
]);
}
int
x
=
int
(
floorf
(
xperiodic
/
voxelSizeX
));
int
y
=
int
(
floorf
(
yperiodic
/
voxelSizeY
));
int
z
=
int
(
floorf
(
zperiodic
/
voxelSizeZ
));
int
x
=
min
(
nx
-
1
,
int
(
floorf
(
xperiodic
/
voxelSizeX
)));
int
y
=
min
(
ny
-
1
,
int
(
floorf
(
yperiodic
/
voxelSizeY
)));
return
VoxelIndex
(
x
,
y
,
z
);
return
VoxelIndex
(
x
,
y
);
}
void
getNeighbors
(
vector
<
pair
<
int
,
int
>
>&
neighbors
,
const
VoxelItem
&
referencePoint
,
const
vector
<
set
<
int
>
>&
exclusions
,
float
maxDistance
)
const
{
// Loop over neighboring voxels
// TODO use more clever selection of neighboring voxels
const
int
atomI
=
referencePoint
.
second
;
const
float
*
locationI
=
referencePoint
.
first
;
__m128
posI
=
_mm_loadu_ps
(
locationI
);
__m128
boxSize
=
_mm_set_ps
(
0
,
periodicBoxSize
[
2
],
periodicBoxSize
[
1
],
periodicBoxSize
[
0
]);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
1
/
periodicBoxSize
[
2
]),
(
1
/
periodicBoxSize
[
1
]),
(
1
/
periodicBoxSize
[
0
]));
__m128
half
=
_mm_set1_ps
(
0.5
);
void
getNeighbors
(
vector
<
int
>&
neighbors
,
int
blockIndex
,
fvec4
blockCenter
,
fvec4
blockWidth
,
const
vector
<
int
>&
sortedAtoms
,
vector
<
char
>&
exclusions
,
float
maxDistance
,
const
vector
<
int
>
blockAtoms
,
const
float
*
atomLocations
)
const
{
neighbors
.
resize
(
0
);
exclusions
.
resize
(
0
);
fvec4
boxSize
(
periodicBoxSize
[
0
],
periodicBoxSize
[
1
],
periodicBoxSize
[
2
],
0
);
fvec4
invBoxSize
(
1
/
periodicBoxSize
[
0
],
1
/
periodicBoxSize
[
1
],
1
/
periodicBoxSize
[
2
],
0
);
float
maxDistanceSquared
=
maxDistance
*
maxDistance
;
float
refineCutoff
=
maxDistance
-
max
(
max
(
blockWidth
[
0
],
blockWidth
[
1
]),
blockWidth
[
2
]);
float
refineCutoffSquared
=
refineCutoff
*
refineCutoff
;
int
dIndexX
=
int
(
maxDistance
/
voxelSizeX
)
+
1
;
// How may voxels away do we have to look?
int
dIndexY
=
int
(
maxDistance
/
voxelSizeY
)
+
1
;
int
dIndexZ
=
int
(
maxDistance
/
voxelSizeZ
)
+
1
;
VoxelIndex
centerVoxelIndex
=
getVoxelIndex
(
locationI
);
int
lastx
=
centerVoxelIndex
.
x
+
dIndexX
;
int
lasty
=
centerVoxelIndex
.
y
+
dIndexY
;
int
lastz
=
centerVoxelIndex
.
z
+
dIndexZ
;
int
dIndexX
=
min
(
nx
/
2
,
int
((
maxDistance
+
blockWidth
[
0
])
/
voxelSizeX
)
+
1
);
// How may voxels away do we have to look?
int
dIndexY
=
min
(
ny
/
2
,
int
((
maxDistance
+
blockWidth
[
1
])
/
voxelSizeY
)
+
1
);
float
centerPos
[
4
];
blockCenter
.
store
(
centerPos
);
VoxelIndex
centerVoxelIndex
=
getVoxelIndex
(
centerPos
);
int
startx
=
centerVoxelIndex
.
x
-
dIndexX
;
int
starty
=
centerVoxelIndex
.
y
-
dIndexY
;
int
endx
=
centerVoxelIndex
.
x
+
dIndexX
;
int
endy
=
centerVoxelIndex
.
y
+
dIndexY
;
int
numRanges
;
if
(
usePeriodic
)
{
last
x
=
min
(
last
x
,
centerVoxelIndex
.
x
-
dIndexX
+
nx
-
1
);
last
y
=
min
(
last
y
,
centerVoxelIndex
.
y
-
dIndexY
+
ny
-
1
);
lastz
=
min
(
lastz
,
centerVoxelIndex
.
z
-
dIndexZ
+
nz
-
1
)
;
end
x
=
min
(
end
x
,
centerVoxelIndex
.
x
-
dIndexX
+
nx
-
1
);
end
y
=
min
(
end
y
,
centerVoxelIndex
.
y
-
dIndexY
+
ny
-
1
);
numRanges
=
2
;
}
VoxelIndex
voxelIndex
(
0
,
0
,
0
);
for
(
int
x
=
centerVoxelIndex
.
x
-
dIndexX
;
x
<=
lastx
;
++
x
)
{
else
{
startx
=
max
(
startx
,
0
);
starty
=
max
(
starty
,
0
);
endx
=
min
(
endx
,
nx
-
1
);
endy
=
min
(
endy
,
ny
-
1
);
numRanges
=
1
;
}
int
lastSortedIndex
=
BlockSize
*
(
blockIndex
+
1
);
VoxelIndex
voxelIndex
(
0
,
0
);
for
(
int
x
=
startx
;
x
<=
endx
;
++
x
)
{
voxelIndex
.
x
=
x
;
if
(
usePeriodic
)
voxelIndex
.
x
=
(
x
<
0
?
x
+
nx
:
(
x
>=
nx
?
x
-
nx
:
x
));
for
(
int
y
=
centerVoxelIndex
.
y
-
dIndexY
;
y
<=
lasty
;
++
y
)
{
float
dx
=
max
(
0.0
f
,
voxelSizeX
*
max
(
0
,
abs
(
centerVoxelIndex
.
x
-
x
)
-
1
)
-
blockWidth
[
0
]);
for
(
int
y
=
starty
;
y
<=
endy
;
++
y
)
{
voxelIndex
.
y
=
y
;
if
(
usePeriodic
)
voxelIndex
.
y
=
(
y
<
0
?
y
+
ny
:
(
y
>=
ny
?
y
-
ny
:
y
));
for
(
int
z
=
centerVoxelIndex
.
z
-
dIndexZ
;
z
<=
lastz
;
++
z
)
{
voxelIndex
.
z
=
z
;
if
(
usePeriodic
)
voxelIndex
.
z
=
(
z
<
0
?
z
+
nz
:
(
z
>=
nz
?
z
-
nz
:
z
));
const
map
<
VoxelIndex
,
Voxel
>::
const_iterator
voxelEntry
=
voxelMap
.
find
(
voxelIndex
);
if
(
voxelEntry
==
voxelMap
.
end
())
continue
;
// no such voxel; skip
const
Voxel
&
voxel
=
voxelEntry
->
second
;
for
(
Voxel
::
const_iterator
itemIter
=
voxel
.
begin
();
itemIter
!=
voxel
.
end
();
++
itemIter
)
{
const
int
atomJ
=
itemIter
->
second
;
float
dy
=
max
(
0.0
f
,
voxelSizeY
*
max
(
0
,
abs
(
centerVoxelIndex
.
y
-
y
)
-
1
)
-
blockWidth
[
1
]);
// Identify the range of atoms within this bin we need to search. When using periodic boundary
// conditions, there may be two separate ranges.
float
dz
=
maxDistance
+
blockWidth
[
2
];
dz
=
sqrtf
(
max
(
0.0
f
,
dz
*
dz
-
dx
*
dx
-
dy
*
dy
));
int
rangeStart
[
2
];
int
rangeEnd
[
2
];
rangeStart
[
0
]
=
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
-
dz
);
if
(
usePeriodic
)
{
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
dz
);
if
(
rangeStart
[
0
]
>
0
)
{
rangeStart
[
1
]
=
0
;
rangeEnd
[
1
]
=
min
(
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
dz
-
periodicBoxSize
[
2
]),
rangeStart
[
0
]);
}
else
{
rangeStart
[
1
]
=
max
(
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
-
dz
+
periodicBoxSize
[
2
]),
rangeEnd
[
0
]);
rangeEnd
[
1
]
=
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
].
size
();
}
}
else
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
dz
);
// Loop over atoms and check to see if they are neighbors of this block.
for
(
int
range
=
0
;
range
<
numRanges
;
range
++
)
{
for
(
int
item
=
rangeStart
[
range
];
item
<
rangeEnd
[
range
];
item
++
)
{
const
int
sortedIndex
=
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
][
item
].
second
.
second
;
// Avoid duplicate entries.
if
(
atomJ
>=
atomI
)
break
;
if
(
sortedIndex
>=
lastSortedIndex
)
continue
;
__m128
posJ
=
_mm_loadu_ps
(
itemIter
->
first
);
__m128
delta
=
_mm_sub_ps
(
posJ
,
posI
)
;
fvec4
atomPos
(
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
][
item
].
second
.
first
);
fvec4
delta
=
atomPos
-
centerPos
;
if
(
usePeriodic
)
{
__m128
base
=
_mm_mul_ps
(
_mm_floor_ps
(
_mm_add_ps
(
_mm_mul_ps
(
delta
,
invBoxSize
)
,
half
)),
boxSize
)
;
delta
=
_mm_sub_ps
(
delta
,
base
)
;
fvec4
base
=
round
(
delta
*
invBoxSize
)
*
boxSize
;
delta
=
delta
-
base
;
}
float
dSquared
=
_mm_cvtss_f32
(
_mm_dp_ps
(
delta
,
delta
,
0x71
));
delta
=
max
(
0.0
f
,
abs
(
delta
)
-
blockWidth
);
float
dSquared
=
dot3
(
delta
,
delta
);
if
(
dSquared
>
maxDistanceSquared
)
continue
;
// Ignore exclusions.
if
(
exclusions
[
atomI
].
find
(
atomJ
)
!=
exclusions
[
atomI
].
end
())
continue
;
neighbors
.
push_back
(
make_pair
(
atomI
,
atomJ
));
if
(
dSquared
>
refineCutoffSquared
)
{
// The distance is large enough that there might not be any actual interactions.
// Check individual atom pairs to be sure.
bool
any
=
false
;
for
(
int
k
=
0
;
k
<
(
int
)
blockAtoms
.
size
();
k
++
)
{
fvec4
pos1
(
&
atomLocations
[
4
*
blockAtoms
[
k
]]);
delta
=
atomPos
-
pos1
;
if
(
usePeriodic
)
{
fvec4
base
=
round
(
delta
*
invBoxSize
)
*
boxSize
;
delta
=
delta
-
base
;
}
float
r2
=
dot3
(
delta
,
delta
);
if
(
r2
<
maxDistanceSquared
)
{
any
=
true
;
break
;
}
}
if
(
!
any
)
continue
;
}
// Add this atom to the list of neighbors.
neighbors
.
push_back
(
sortedAtoms
[
sortedIndex
]);
if
(
sortedIndex
<
BlockSize
*
blockIndex
)
exclusions
.
push_back
(
0
);
else
exclusions
.
push_back
(
0xF
&
(
0xF
<<
(
sortedIndex
-
BlockSize
*
blockIndex
)));
}
}
}
...
...
@@ -147,11 +249,12 @@ public:
}
private:
float
voxelSizeX
,
voxelSizeY
,
voxelSizeZ
;
int
nx
,
ny
,
nz
;
float
voxelSizeX
,
voxelSizeY
;
float
minx
,
maxx
,
miny
,
maxy
;
int
nx
,
ny
;
const
float
*
periodicBoxSize
;
const
bool
usePeriodic
;
map
<
VoxelIndex
,
Voxel
>
voxelMap
;
vector
<
vector
<
vector
<
pair
<
float
,
VoxelItem
>
>
>
>
bins
;
};
class
CpuNeighborList
::
ThreadData
{
...
...
@@ -160,12 +263,11 @@ public:
}
int
index
;
CpuNeighborList
&
owner
;
vector
<
pair
<
int
,
int
>
>
threadNeighbors
;
};
static
void
*
threadBody
(
void
*
args
)
{
CpuNeighborList
::
ThreadData
&
data
=
*
reinterpret_cast
<
CpuNeighborList
::
ThreadData
*>
(
args
);
data
.
owner
.
runThread
(
data
.
index
,
data
.
threadNeighbors
);
data
.
owner
.
runThread
(
data
.
index
);
delete
&
data
;
return
0
;
}
...
...
@@ -203,23 +305,13 @@ CpuNeighborList::~CpuNeighborList() {
void
CpuNeighborList
::
computeNeighborList
(
int
numAtoms
,
const
vector
<
float
>&
atomLocations
,
const
vector
<
set
<
int
>
>&
exclusions
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
)
{
// Build the voxel hash.
float
edgeSizeX
,
edgeSizeY
,
edgeSizeZ
;
if
(
!
usePeriodic
)
edgeSizeX
=
edgeSizeY
=
edgeSizeZ
=
maxDistance
;
// TODO - adjust this as needed
else
{
edgeSizeX
=
0.5
f
*
periodicBoxSize
[
0
]
/
floorf
(
periodicBoxSize
[
0
]
/
maxDistance
);
edgeSizeY
=
0.5
f
*
periodicBoxSize
[
1
]
/
floorf
(
periodicBoxSize
[
1
]
/
maxDistance
);
edgeSizeZ
=
0.5
f
*
periodicBoxSize
[
2
]
/
floorf
(
periodicBoxSize
[
2
]
/
maxDistance
);
}
VoxelHash
voxelHash
(
edgeSizeX
,
edgeSizeY
,
edgeSizeZ
,
periodicBoxSize
,
usePeriodic
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
voxelHash
.
insert
(
i
,
&
atomLocations
[
4
*
i
]);
int
numBlocks
=
(
numAtoms
+
BlockSize
-
1
)
/
BlockSize
;
blockNeighbors
.
resize
(
numBlocks
);
blockExclusions
.
resize
(
numBlocks
);
sortedAtoms
.
resize
(
numAtoms
);
// Record the parameters for the threads.
this
->
voxelHash
=
&
voxelHash
;
this
->
exclusions
=
&
exclusions
;
this
->
atomLocations
=
&
atomLocations
[
0
];
this
->
periodicBoxSize
=
periodicBoxSize
;
...
...
@@ -227,43 +319,158 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
this
->
usePeriodic
=
usePeriodic
;
this
->
maxDistance
=
maxDistance
;
// Signal the threads to start running and wait for them to finish.
// Identify the range of atom positions along each axis.
fvec4
minPos
(
&
atomLocations
[
0
]);
fvec4
maxPos
=
minPos
;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
fvec4
pos
(
&
atomLocations
[
4
*
i
]);
minPos
=
min
(
minPos
,
pos
);
maxPos
=
max
(
maxPos
,
pos
);
}
minx
=
minPos
[
0
];
maxx
=
maxPos
[
0
];
miny
=
minPos
[
1
];
maxy
=
maxPos
[
1
];
minz
=
minPos
[
2
];
maxz
=
maxPos
[
2
];
// Sort the atoms based on a Hilbert curve.
atomBins
.
resize
(
numAtoms
);
pthread_mutex_lock
(
&
lock
);
waitCount
=
0
;
pthread_cond_broadcast
(
&
startCondition
);
while
(
waitCount
<
numThreads
)
pthread_cond_wait
(
&
endCondition
,
&
lock
);
advanceThreads
();
sort
(
atomBins
.
begin
(),
atomBins
.
end
());
// Build the voxel hash.
float
edgeSizeX
,
edgeSizeY
;
if
(
!
usePeriodic
)
edgeSizeX
=
edgeSizeY
=
maxDistance
;
// TODO - adjust this as needed
else
{
edgeSizeX
=
0.5
f
*
periodicBoxSize
[
0
]
/
floorf
(
periodicBoxSize
[
0
]
/
maxDistance
);
edgeSizeY
=
0.5
f
*
periodicBoxSize
[
1
]
/
floorf
(
periodicBoxSize
[
1
]
/
maxDistance
);
}
Voxels
voxels
(
edgeSizeX
,
edgeSizeY
,
minx
,
maxx
,
miny
,
maxy
,
periodicBoxSize
,
usePeriodic
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
int
atomIndex
=
atomBins
[
i
].
second
;
sortedAtoms
[
i
]
=
atomIndex
;
voxels
.
insert
(
i
,
&
atomLocations
[
4
*
atomIndex
]);
}
voxels
.
sortItems
();
this
->
voxels
=
&
voxels
;
// Signal the threads to start running and wait for them to finish.
advanceThreads
();
pthread_mutex_unlock
(
&
lock
);
//
Combine the results from all the threads
.
//
Add padding atoms to fill up the last block
.
neighbors
.
clear
();
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
neighbors
.
insert
(
neighbors
.
end
(),
threadData
[
i
]
->
threadNeighbors
.
begin
(),
threadData
[
i
]
->
threadNeighbors
.
end
());
int
numPadding
=
numBlocks
*
BlockSize
-
numAtoms
;
if
(
numPadding
>
0
)
{
char
mask
=
(
0xF0
>>
numPadding
)
&
0xF
;
for
(
int
i
=
0
;
i
<
numPadding
;
i
++
)
sortedAtoms
.
push_back
(
0
);
vector
<
char
>&
exc
=
blockExclusions
[
blockExclusions
.
size
()
-
1
];
for
(
int
i
=
0
;
i
<
(
int
)
exc
.
size
();
i
++
)
exc
[
i
]
|=
mask
;
}
}
int
CpuNeighborList
::
getNumBlocks
()
const
{
return
sortedAtoms
.
size
()
/
BlockSize
;
}
const
vector
<
pair
<
int
,
int
>
>&
CpuNeighborList
::
getNeighbors
()
{
return
neighbors
;
const
std
::
vector
<
int
>&
CpuNeighborList
::
getSortedAtoms
()
const
{
return
sortedAtoms
;
}
const
std
::
vector
<
int
>&
CpuNeighborList
::
getBlockNeighbors
(
int
blockIndex
)
const
{
return
blockNeighbors
[
blockIndex
];
}
const
std
::
vector
<
char
>&
CpuNeighborList
::
getBlockExclusions
(
int
blockIndex
)
const
{
return
blockExclusions
[
blockIndex
];
}
void
CpuNeighborList
::
runThread
(
int
index
,
vector
<
pair
<
int
,
int
>
>&
threadNeighbors
)
{
void
CpuNeighborList
::
runThread
(
int
index
)
{
while
(
true
)
{
// Wait for the signal to start running.
pthread_mutex_lock
(
&
lock
);
waitCount
++
;
pthread_cond_signal
(
&
endCondition
);
pthread_cond_wait
(
&
startCondition
,
&
lock
);
pthread_mutex_unlock
(
&
lock
);
threadWait
();
if
(
isDeleted
)
break
;
// Compute the positions of atoms along the Hilbert curve.
float
binWidth
=
max
(
max
(
maxx
-
minx
,
maxy
-
miny
),
maxz
-
minz
)
/
255.0
f
;
float
invBinWidth
=
1.0
f
/
binWidth
;
bitmask_t
coords
[
3
];
for
(
int
i
=
index
;
i
<
numAtoms
;
i
+=
numThreads
)
{
const
float
*
pos
=
&
atomLocations
[
4
*
i
];
coords
[
0
]
=
(
bitmask_t
)
((
pos
[
0
]
-
minx
)
*
invBinWidth
);
coords
[
1
]
=
(
bitmask_t
)
((
pos
[
1
]
-
miny
)
*
invBinWidth
);
coords
[
2
]
=
(
bitmask_t
)
((
pos
[
2
]
-
minz
)
*
invBinWidth
);
int
bin
=
(
int
)
hilbert_c2i
(
3
,
8
,
coords
);
atomBins
[
i
]
=
pair
<
int
,
int
>
(
bin
,
i
);
}
threadWait
();
// Compute this thread's subset of neighbors.
threadNeighbors
.
clear
();
for
(
int
i
=
index
;
i
<
numAtoms
;
i
+=
numThreads
)
voxelHash
->
getNeighbors
(
threadNeighbors
,
VoxelItem
(
&
atomLocations
[
4
*
i
],
i
),
*
exclusions
,
maxDistance
);
int
numBlocks
=
blockNeighbors
.
size
();
vector
<
int
>
blockAtoms
;
for
(
int
i
=
index
;
i
<
numBlocks
;
i
+=
numThreads
)
{
{
int
firstIndex
=
BlockSize
*
i
;
int
atomsInBlock
=
min
(
BlockSize
,
numAtoms
-
firstIndex
);
blockAtoms
.
resize
(
atomsInBlock
);
for
(
int
j
=
0
;
j
<
atomsInBlock
;
j
++
)
blockAtoms
[
j
]
=
sortedAtoms
[
firstIndex
+
j
];
}
int
firstIndex
=
BlockSize
*
i
;
fvec4
minPos
(
&
atomLocations
[
4
*
sortedAtoms
[
firstIndex
]]);
fvec4
maxPos
=
minPos
;
int
atomsInBlock
=
min
(
BlockSize
,
numAtoms
-
firstIndex
);
for
(
int
j
=
1
;
j
<
atomsInBlock
;
j
++
)
{
fvec4
pos
(
&
atomLocations
[
4
*
sortedAtoms
[
firstIndex
+
j
]]);
minPos
=
min
(
minPos
,
pos
);
maxPos
=
max
(
maxPos
,
pos
);
}
voxels
->
getNeighbors
(
blockNeighbors
[
i
],
i
,
(
maxPos
+
minPos
)
*
0.5
f
,
(
maxPos
-
minPos
)
*
0.5
f
,
sortedAtoms
,
blockExclusions
[
i
],
maxDistance
,
blockAtoms
,
atomLocations
);
// Record the exclusions for this block.
for
(
int
j
=
0
;
j
<
atomsInBlock
;
j
++
)
{
const
set
<
int
>&
atomExclusions
=
(
*
exclusions
)[
sortedAtoms
[
firstIndex
+
j
]];
char
mask
=
1
<<
j
;
for
(
int
k
=
0
;
k
<
(
int
)
blockNeighbors
[
i
].
size
();
k
++
)
{
int
atomIndex
=
blockNeighbors
[
i
][
k
];
if
(
atomExclusions
.
find
(
atomIndex
)
!=
atomExclusions
.
end
())
blockExclusions
[
i
][
k
]
|=
mask
;
}
}
}
}
}
void
CpuNeighborList
::
threadWait
()
{
pthread_mutex_lock
(
&
lock
);
waitCount
++
;
pthread_cond_signal
(
&
endCondition
);
pthread_cond_wait
(
&
startCondition
,
&
lock
);
pthread_mutex_unlock
(
&
lock
);
}
void
CpuNeighborList
::
advanceThreads
()
{
waitCount
=
0
;
pthread_cond_broadcast
(
&
startCondition
);
while
(
waitCount
<
numThreads
)
{
pthread_cond_wait
(
&
endCondition
,
&
lock
);
}
}
...
...
platforms/cpu/src/CpuNonbondedForce.cpp
View file @
79fd69c3
...
...
@@ -32,6 +32,7 @@
#include "ReferencePME.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
...
...
@@ -113,7 +114,7 @@ CpuNonbondedForce::~CpuNonbondedForce(){
--------------------------------------------------------------------------------------- */
void
CpuNonbondedForce
::
setUseCutoff
(
float
distance
,
const
vector
<
pair
<
int
,
int
>
>
&
neighbors
,
float
solventDielectric
)
{
void
CpuNonbondedForce
::
setUseCutoff
(
float
distance
,
const
CpuNeighborList
&
neighbors
,
float
solventDielectric
)
{
cutoff
=
true
;
cutoffDistance
=
distance
;
...
...
@@ -199,23 +200,22 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void
CpuNonbondedForce
::
tabulateEwaldScaleFactor
()
{
ewaldDX
=
cutoffDistance
/
(
NUM_TABLE_POINTS
-
2
);
ewaldDXInv
=
1.0
f
/
ewaldDX
;
vector
<
double
>
x
(
NUM_TABLE_POINTS
);
vector
<
double
>
y
(
NUM_TABLE_POINTS
);
vector
<
double
>
x
(
NUM_TABLE_POINTS
+
1
);
vector
<
double
>
y
(
NUM_TABLE_POINTS
+
1
);
vector
<
double
>
deriv
;
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
;
i
++
)
{
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
1
;
i
++
)
{
double
r
=
i
*
cutoffDistance
/
(
NUM_TABLE_POINTS
-
2
);
double
alphaR
=
alphaEwald
*
r
;
x
[
i
]
=
r
;
y
[
i
]
=
erfc
(
alphaR
)
+
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
);
}
SplineFitter
::
createNaturalSpline
(
x
,
y
,
deriv
);
ewaldScaleX
.
resize
(
NUM_TABLE_POINTS
);
ewaldScaleY
.
resize
(
NUM_TABLE_POINTS
);
ewaldScaleDeriv
.
resize
(
NUM_TABLE_POINTS
);
ewaldScaleTable
.
resize
(
4
*
NUM_TABLE_POINTS
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
;
i
++
)
{
ewaldScaleX
[
i
]
=
(
float
)
x
[
i
];
ewaldScaleY
[
i
]
=
(
float
)
y
[
i
];
ewaldScaleDeriv
[
i
]
=
(
float
)
(
deriv
[
i
]
*
ewaldDX
*
ewaldDX
/
6
);
ewaldScaleTable
[
4
*
i
]
=
(
float
)
y
[
i
];
ewaldScaleTable
[
4
*
i
+
1
]
=
(
float
)
y
[
i
+
1
];
ewaldScaleTable
[
4
*
i
+
2
]
=
(
float
)
(
deriv
[
i
]
*
ewaldDX
*
ewaldDX
/
6
);
ewaldScaleTable
[
4
*
i
+
3
]
=
(
float
)
(
deriv
[
i
+
1
]
*
ewaldDX
*
ewaldDX
/
6
);
}
}
...
...
@@ -358,42 +358,12 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
directEnergy
+=
threadData
[
i
]
->
threadEnergy
;
for
(
int
i
=
0
;
i
<
numberOfAtoms
;
i
++
)
{
__m128
f
=
_mm_loadu_ps
(
forces
+
4
*
i
);
fvec4
f
(
forces
+
4
*
i
);
for
(
int
j
=
0
;
j
<
numThreads
;
j
++
)
f
=
_mm_add_ps
(
f
,
_mm_loadu_ps
(
&
threadData
[
j
]
->
threadForce
[
4
*
i
])
)
;
_mm_
store
u_ps
(
forces
+
4
*
i
,
f
);
f
+
=
fvec4
(
&
threadData
[
j
]
->
threadForce
[
4
*
i
]);
f
.
store
(
forces
+
4
*
i
);
}
if
(
ewald
||
pme
)
{
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
__m128
boxSize
=
_mm_set_ps
(
0
,
periodicBoxSize
[
2
],
periodicBoxSize
[
1
],
periodicBoxSize
[
0
]);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
1
/
periodicBoxSize
[
2
]),
(
1
/
periodicBoxSize
[
1
]),
(
1
/
periodicBoxSize
[
0
]));
for
(
int
i
=
0
;
i
<
numberOfAtoms
;
i
++
)
for
(
set
<
int
>::
const_iterator
iter
=
exclusions
[
i
].
begin
();
iter
!=
exclusions
[
i
].
end
();
++
iter
)
{
if
(
*
iter
>
i
)
{
int
ii
=
i
;
int
jj
=
*
iter
;
__m128
deltaR
;
__m128
posI
=
_mm_loadu_ps
(
posq
+
4
*
ii
);
__m128
posJ
=
_mm_loadu_ps
(
posq
+
4
*
jj
);
float
r2
;
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
false
,
boxSize
,
invBoxSize
);
float
r
=
sqrtf
(
r2
);
float
inverseR
=
1
/
r
;
float
chargeProd
=
ONE_4PI_EPS0
*
posq
[
4
*
ii
+
3
]
*
posq
[
4
*
jj
+
3
];
float
alphaR
=
alphaEwald
*
r
;
float
erfcAlphaR
=
erfcApprox
(
alphaR
);
float
dEdR
=
(
float
)
(
chargeProd
*
inverseR
*
inverseR
*
inverseR
);
dEdR
=
(
float
)
(
dEdR
*
(
1.0
f
-
erfcAlphaR
-
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
)));
__m128
result
=
_mm_mul_ps
(
deltaR
,
_mm_set1_ps
(
dEdR
));
_mm_storeu_ps
(
forces
+
4
*
ii
,
_mm_sub_ps
(
_mm_loadu_ps
(
forces
+
4
*
ii
),
result
));
_mm_storeu_ps
(
forces
+
4
*
jj
,
_mm_add_ps
(
_mm_loadu_ps
(
forces
+
4
*
jj
),
result
));
if
(
includeEnergy
)
directEnergy
-=
chargeProd
*
inverseR
*
(
1.0
f
-
erfcAlphaR
);
}
}
}
if
(
totalEnergy
!=
NULL
)
*
totalEnergy
+=
(
float
)
directEnergy
;
}
...
...
@@ -418,23 +388,47 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double&
threadForce
.
resize
(
4
*
numberOfAtoms
,
0.0
f
);
for
(
int
i
=
0
;
i
<
4
*
numberOfAtoms
;
i
++
)
threadForce
[
i
]
=
0.0
f
;
__m128
boxSize
=
_mm_set_ps
(
0
,
periodicBoxSize
[
2
],
periodicBoxSize
[
1
],
periodicBoxSize
[
0
]
);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
1
/
periodicBoxSize
[
2
]),
(
1
/
periodicBoxSize
[
1
]),
(
1
/
periodicBoxSize
[
0
]));
fvec4
boxSize
(
periodicBoxSize
[
0
],
periodicBoxSize
[
1
],
periodicBoxSize
[
2
],
0
);
fvec4
invBoxSize
(
(
1
/
periodicBoxSize
[
0
]),
(
1
/
periodicBoxSize
[
1
]),
(
1
/
periodicBoxSize
[
2
])
,
0
);
if
(
ewald
||
pme
)
{
// Compute the interactions from the neighbor list.
for
(
int
i
=
index
;
i
<
(
int
)
neighborList
->
size
();
i
+=
numThreads
)
{
pair
<
int
,
int
>
pair
=
(
*
neighborList
)[
i
];
calculateOneEwaldIxn
(
pair
.
first
,
pair
.
second
,
&
threadForce
[
0
],
energyPtr
,
boxSize
,
invBoxSize
);
}
for
(
int
i
=
index
;
i
<
neighborList
->
getNumBlocks
();
i
+=
numThreads
)
calculateBlockEwaldIxn
(
i
,
&
threadForce
[
0
],
energyPtr
,
boxSize
,
invBoxSize
);
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
fvec4
boxSize
(
periodicBoxSize
[
0
],
periodicBoxSize
[
1
],
periodicBoxSize
[
2
],
0
);
fvec4
invBoxSize
((
1
/
periodicBoxSize
[
0
]),
(
1
/
periodicBoxSize
[
1
]),
(
1
/
periodicBoxSize
[
2
]),
0
);
for
(
int
i
=
index
;
i
<
numberOfAtoms
;
i
+=
numThreads
)
for
(
set
<
int
>::
const_iterator
iter
=
exclusions
[
i
].
begin
();
iter
!=
exclusions
[
i
].
end
();
++
iter
)
{
if
(
*
iter
>
i
)
{
int
j
=
*
iter
;
fvec4
deltaR
;
fvec4
posI
(
posq
+
4
*
i
);
fvec4
posJ
(
posq
+
4
*
j
);
float
r2
;
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
false
,
boxSize
,
invBoxSize
);
float
r
=
sqrtf
(
r2
);
float
inverseR
=
1
/
r
;
float
chargeProd
=
ONE_4PI_EPS0
*
posq
[
4
*
i
+
3
]
*
posq
[
4
*
j
+
3
];
float
alphaR
=
alphaEwald
*
r
;
float
erfcAlphaR
=
erfcApprox
(
alphaR
)[
0
];
float
dEdR
=
(
float
)
(
chargeProd
*
inverseR
*
inverseR
*
inverseR
);
dEdR
=
(
float
)
(
dEdR
*
(
1.0
f
-
erfcAlphaR
-
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
)));
fvec4
result
=
deltaR
*
dEdR
;
(
fvec4
(
&
threadForce
[
4
*
i
])
-
result
).
store
(
&
threadForce
[
4
*
i
]);
(
fvec4
(
&
threadForce
[
4
*
j
])
+
result
).
store
(
&
threadForce
[
4
*
j
]);
if
(
includeEnergy
)
threadEnergy
-=
chargeProd
*
inverseR
*
(
1.0
f
-
erfcAlphaR
);
}
}
}
else
if
(
cutoff
)
{
// Compute the interactions from the neighbor list.
for
(
int
i
=
index
;
i
<
(
int
)
neighborList
->
size
();
i
+=
numThreads
)
{
pair
<
int
,
int
>
pair
=
(
*
neighborList
)[
i
];
calculateOneIxn
(
pair
.
first
,
pair
.
second
,
&
threadForce
[
0
],
energyPtr
,
boxSize
,
invBoxSize
);
}
for
(
int
i
=
index
;
i
<
neighborList
->
getNumBlocks
();
i
+=
numThreads
)
calculateBlockIxn
(
i
,
&
threadForce
[
0
],
energyPtr
,
boxSize
,
invBoxSize
);
}
else
{
// Loop over all atom pairs
...
...
@@ -448,12 +442,12 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double&
}
}
void
CpuNonbondedForce
::
calculateOneIxn
(
int
ii
,
int
jj
,
float
*
forces
,
double
*
totalEnergy
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
)
{
void
CpuNonbondedForce
::
calculateOneIxn
(
int
ii
,
int
jj
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
// get deltaR, R2, and R between 2 atoms
__m128
deltaR
;
__m128
posI
=
_mm_loadu_ps
(
posq
+
4
*
ii
);
__m128
posJ
=
_mm_loadu_ps
(
posq
+
4
*
jj
);
fvec4
deltaR
;
fvec4
posI
(
posq
+
4
*
ii
);
fvec4
posJ
(
posq
+
4
*
jj
);
float
r2
;
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
periodic
,
boxSize
,
invBoxSize
);
if
(
cutoff
&&
r2
>=
cutoffDistance
*
cutoffDistance
)
...
...
@@ -497,83 +491,239 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
// accumulate forces
__m128
result
=
_mm_mul_ps
(
deltaR
,
_mm_set1_ps
(
dEdR
))
;
_mm_storeu_ps
(
forces
+
4
*
ii
,
_mm_add_ps
(
_mm_loadu_ps
(
forces
+
4
*
ii
)
,
result
))
;
_mm_storeu_ps
(
forces
+
4
*
jj
,
_mm_sub_ps
(
_mm_loadu_ps
(
forces
+
4
*
jj
)
,
result
))
;
fvec4
result
=
deltaR
*
dEdR
;
(
fvec4
(
forces
+
4
*
ii
)
+
result
).
store
(
forces
+
4
*
ii
);
(
fvec4
(
forces
+
4
*
jj
)
-
result
).
store
(
forces
+
4
*
jj
);
}
void
CpuNonbondedForce
::
calculateOneEwaldIxn
(
int
ii
,
int
jj
,
float
*
forces
,
double
*
totalEnergy
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
)
{
__m128
deltaR
;
__m128
posI
=
_mm_loadu_ps
(
posq
+
4
*
ii
);
__m128
posJ
=
_mm_loadu_ps
(
posq
+
4
*
jj
);
float
r2
;
getDeltaR
(
posJ
,
posI
,
deltaR
,
r2
,
true
,
boxSize
,
invBoxSize
);
if
(
r2
<
cutoffDistance
*
cutoffDistance
)
{
float
r
=
sqrtf
(
r2
);
float
inverseR
=
1
/
r
;
float
switchValue
=
1
,
switchDeriv
=
0
;
if
(
useSwitch
&&
r
>
switchingDistance
)
{
float
t
=
(
r
-
switchingDistance
)
/
(
cutoffDistance
-
switchingDistance
);
switchValue
=
1
+
t
*
t
*
t
*
(
-
10
+
t
*
(
15
-
t
*
6
));
switchDeriv
=
t
*
t
*
(
-
30
+
t
*
(
60
-
t
*
30
))
/
(
cutoffDistance
-
switchingDistance
);
void
CpuNonbondedForce
::
calculateBlockIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
// Load the positions and parameters of the atoms in the block.
int
blockAtom
[
4
];
fvec4
blockAtomPosq
[
4
];
fvec4
blockAtomForce
[
4
];
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
blockAtom
[
i
]
=
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
+
i
];
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
blockAtomForce
[
i
]
=
fvec4
(
0.0
f
);
}
fvec4
blockAtomCharge
=
fvec4
(
ONE_4PI_EPS0
)
*
fvec4
(
blockAtomPosq
[
0
][
3
],
blockAtomPosq
[
1
][
3
],
blockAtomPosq
[
2
][
3
],
blockAtomPosq
[
3
][
3
]);
fvec4
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
);
fvec4
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
);
// Loop over neighbors for this block.
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
float
blockAtomR2
[
4
];
bool
include
[
4
];
fvec4
blockAtomDelta
[
4
];
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
int
atom
=
neighbors
[
i
];
fvec4
atomPosq
(
posq
+
4
*
atom
);
// Compute the distances to the block atoms.
bool
any
=
false
;
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
getDeltaR
(
atomPosq
,
blockAtomPosq
[
j
],
blockAtomDelta
[
j
],
blockAtomR2
[
j
],
periodic
,
boxSize
,
invBoxSize
);
include
[
j
]
=
(((
exclusions
[
i
]
>>
j
)
&
1
)
==
0
&&
(
!
cutoff
||
blockAtomR2
[
j
]
<
cutoffDistance
*
cutoffDistance
));
any
|=
include
[
j
];
}
if
(
!
any
)
continue
;
// No interactions to compute.
// Compute the interactions.
fvec4
r2
(
blockAtomR2
);
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
switchValue
(
1.0
f
),
switchDeriv
(
0.0
f
);
if
(
useSwitch
)
{
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
/
(
cutoffDistance
-
switchingDistance
));
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
/
(
cutoffDistance
-
switchingDistance
);
}
float
chargeProd
=
ONE_4PI_EPS0
*
posq
[
4
*
ii
+
3
]
*
posq
[
4
*
jj
+
3
];
float
dEdR
=
chargeProd
*
inverseR
*
ewaldScaleFunction
(
r
);
float
sig
=
atomParameters
[
ii
].
first
+
atomParameters
[
jj
].
first
;
float
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
float
sig6
=
sig2
*
sig2
*
sig2
;
float
eps
=
atomParameters
[
ii
].
second
*
atomParameters
[
jj
].
second
;
dEdR
+=
switchValue
*
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
eps
=
blockAtomEpsilon
*
atomParameters
[
atom
].
second
;
fvec4
dEdR
=
switchValue
*
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
if
(
cutoff
)
dEdR
+=
chargeProd
*
(
inverseR
-
2.0
f
*
krf
*
r2
);
else
dEdR
+=
chargeProd
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
f
loat
energy
=
eps
*
(
sig6
-
1.0
f
)
*
sig6
;
f
vec4
energy
=
eps
*
(
sig6
-
1.0
f
)
*
sig6
;
if
(
useSwitch
)
{
dEdR
-=
energy
*
switchDeriv
*
inverseR
;
energy
*=
switchValue
;
}
//
a
ccumulate
forc
es
//
A
ccumulate
energi
es
.
__m128
result
=
_mm_mul_ps
(
deltaR
,
_mm_set1_ps
(
dEdR
));
_mm_storeu_ps
(
forces
+
4
*
ii
,
_mm_add_ps
(
_mm_loadu_ps
(
forces
+
4
*
ii
),
result
));
_mm_storeu_ps
(
forces
+
4
*
jj
,
_mm_sub_ps
(
_mm_loadu_ps
(
forces
+
4
*
jj
),
result
));
if
(
totalEnergy
)
{
if
(
cutoff
)
energy
+=
chargeProd
*
(
inverseR
+
krf
*
r2
-
crf
);
else
energy
+=
chargeProd
*
inverseR
;
for
(
int
j
=
0
;
j
<
4
;
j
++
)
if
(
include
[
j
])
*
totalEnergy
+=
energy
[
j
];
}
// accumulate energies
// Accumulate forces.
fvec4
atomForce
(
forces
+
4
*
atom
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
fvec4
result
=
blockAtomDelta
[
j
]
*
dEdR
[
j
];
blockAtomForce
[
j
]
+=
result
;
atomForce
-=
result
;
}
}
atomForce
.
store
(
forces
+
4
*
atom
);
}
// Record the forces on the block atoms.
for
(
int
j
=
0
;
j
<
4
;
j
++
)
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
blockAtomForce
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
}
void
CpuNonbondedForce
::
calculateBlockEwaldIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
// Load the positions and parameters of the atoms in the block.
int
blockAtom
[
4
];
fvec4
blockAtomPosq
[
4
];
fvec4
blockAtomForce
[
4
];
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
blockAtom
[
i
]
=
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
+
i
];
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
blockAtomForce
[
i
]
=
fvec4
(
0.0
f
);
}
fvec4
blockAtomCharge
=
fvec4
(
ONE_4PI_EPS0
)
*
fvec4
(
blockAtomPosq
[
0
][
3
],
blockAtomPosq
[
1
][
3
],
blockAtomPosq
[
2
][
3
],
blockAtomPosq
[
3
][
3
]);
fvec4
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
);
fvec4
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
);
// Loop over neighbors for this block.
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
float
blockAtomR2
[
4
];
bool
include
[
4
];
fvec4
blockAtomDelta
[
4
];
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
int
atom
=
neighbors
[
i
];
fvec4
atomPosq
(
posq
+
4
*
atom
);
// Compute the distances to the block atoms.
bool
any
=
false
;
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
getDeltaR
(
atomPosq
,
blockAtomPosq
[
j
],
blockAtomDelta
[
j
],
blockAtomR2
[
j
],
periodic
,
boxSize
,
invBoxSize
);
include
[
j
]
=
(((
exclusions
[
i
]
>>
j
)
&
1
)
==
0
&&
blockAtomR2
[
j
]
<
cutoffDistance
*
cutoffDistance
);
any
|=
include
[
j
];
}
if
(
!
any
)
continue
;
// No interactions to compute.
// Compute the interactions.
fvec4
r2
(
blockAtomR2
);
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
switchValue
(
1.0
f
),
switchDeriv
(
0.0
f
);
if
(
useSwitch
)
{
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
/
(
cutoffDistance
-
switchingDistance
));
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
/
(
cutoffDistance
-
switchingDistance
);
}
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
fvec4
dEdR
=
chargeProd
*
inverseR
*
ewaldScaleFunction
(
r
);
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
eps
=
blockAtomEpsilon
*
atomParameters
[
atom
].
second
;
dEdR
+=
switchValue
*
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
*=
inverseR
*
inverseR
;
fvec4
energy
=
eps
*
(
sig6
-
1.0
f
)
*
sig6
;
if
(
useSwitch
)
{
dEdR
-=
energy
*
switchDeriv
*
inverseR
;
energy
*=
switchValue
;
}
// Accumulate energies.
if
(
totalEnergy
)
{
energy
+=
(
float
)
(
chargeProd
*
inverseR
*
erfcApprox
(
alphaEwald
*
r
));
*
totalEnergy
+=
energy
;
energy
+=
chargeProd
*
inverseR
*
erfcApprox
(
alphaEwald
*
r
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
if
(
include
[
j
])
*
totalEnergy
+=
energy
[
j
];
}
// Accumulate forces.
fvec4
atomForce
(
forces
+
4
*
atom
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
fvec4
result
=
blockAtomDelta
[
j
]
*
dEdR
[
j
];
blockAtomForce
[
j
]
+=
result
;
atomForce
-=
result
;
}
}
atomForce
.
store
(
forces
+
4
*
atom
);
}
// Record the forces on the block atoms.
for
(
int
j
=
0
;
j
<
4
;
j
++
)
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
blockAtomForce
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
}
void
CpuNonbondedForce
::
getDeltaR
(
const
__m128
&
posI
,
const
__m128
&
posJ
,
__m128
&
deltaR
,
float
&
r2
,
bool
periodic
,
const
__m128
&
boxSize
,
const
__m128
&
invBoxSize
)
const
{
deltaR
=
_mm_sub_ps
(
posJ
,
posI
)
;
void
CpuNonbondedForce
::
getDeltaR
(
const
fvec4
&
posI
,
const
fvec4
&
posJ
,
fvec4
&
deltaR
,
float
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
{
deltaR
=
posJ
-
posI
;
if
(
periodic
)
{
__m128
base
=
_mm_mul_ps
(
_mm_floor_ps
(
_mm_add_ps
(
_mm_mul_ps
(
deltaR
,
invBoxSize
)
,
_mm_set1_ps
(
0.5
))),
boxSize
)
;
deltaR
=
_mm_sub_ps
(
deltaR
,
base
)
;
fvec4
base
=
round
(
deltaR
*
invBoxSize
)
*
boxSize
;
deltaR
=
deltaR
-
base
;
}
r2
=
_mm_cvtss_f32
(
_mm_dp_ps
(
deltaR
,
deltaR
,
0x71
)
);
r2
=
dot3
(
deltaR
,
deltaR
);
}
f
loat
CpuNonbondedForce
::
erfcApprox
(
f
loat
x
)
{
f
vec4
CpuNonbondedForce
::
erfcApprox
(
f
vec4
x
)
{
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
f
loat
t
=
1.0
f
+
(
0.0705230784
f
+
(
0.0422820123
f
+
(
0.0092705272
f
+
(
0.0001520143
f
+
(
0.0002765672
f
+
0.0000430638
f
*
x
)
*
x
)
*
x
)
*
x
)
*
x
)
*
x
;
f
vec4
t
=
1.0
f
+
(
0.0705230784
f
+
(
0.0422820123
f
+
(
0.0092705272
f
+
(
0.0001520143
f
+
(
0.0002765672
f
+
0.0000430638
f
*
x
)
*
x
)
*
x
)
*
x
)
*
x
)
*
x
;
t
*=
t
;
t
*=
t
;
t
*=
t
;
return
1.0
f
/
(
t
*
t
);
}
f
loat
CpuNonbondedForce
::
ewaldScaleFunction
(
f
loat
x
)
{
f
vec4
CpuNonbondedForce
::
ewaldScaleFunction
(
f
vec4
x
)
{
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
int
lower
=
(
int
)
(
x
*
ewaldDXInv
);
int
upper
=
lower
+
1
;
float
a
=
(
ewaldScaleX
[
upper
]
-
x
)
*
ewaldDXInv
;
float
b
=
1.0
f
-
a
;
return
a
*
ewaldScaleY
[
lower
]
+
b
*
ewaldScaleY
[
upper
]
+
((
a
*
a
*
a
-
a
)
*
ewaldScaleDeriv
[
lower
]
+
(
b
*
b
*
b
-
b
)
*
ewaldScaleDeriv
[
upper
]);
float
y
[
4
];
fvec4
x1
=
x
*
ewaldDXInv
;
ivec4
index
=
floor
(
x1
);
fvec4
coeff
[
4
];
coeff
[
1
]
=
x1
-
index
;
coeff
[
0
]
=
1.0
f
-
coeff
[
1
];
coeff
[
2
]
=
coeff
[
0
]
*
coeff
[
0
]
*
coeff
[
0
]
-
coeff
[
0
];
coeff
[
3
]
=
coeff
[
1
]
*
coeff
[
1
]
*
coeff
[
1
]
-
coeff
[
1
];
_MM_TRANSPOSE4_PS
(
coeff
[
0
],
coeff
[
1
],
coeff
[
2
],
coeff
[
3
]);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
if
(
index
[
i
]
<
NUM_TABLE_POINTS
)
y
[
i
]
=
dot4
(
coeff
[
i
],
fvec4
(
&
ewaldScaleTable
[
4
*
index
[
i
]]));
}
return
fvec4
(
y
);
}
platforms/cpu/tests/TestCpuNeighborList.cpp
View file @
79fd69c3
...
...
@@ -39,6 +39,7 @@
#include "sfmt/SFMT.h"
#include <iostream>
#include <set>
#include <utility>
#include <vector>
using
namespace
OpenMM
;
...
...
@@ -68,10 +69,19 @@ void testNeighborList(bool periodic) {
// Convert the neighbor list to a set for faster lookup.
set
<
pair
<
int
,
int
>
>
neighbors
;
for
(
int
i
=
0
;
i
<
(
int
)
neighborList
.
getNeighbors
().
size
();
i
++
)
{
pair
<
int
,
int
>
entry
=
neighborList
.
getNeighbors
()[
i
];
ASSERT
(
neighbors
.
find
(
entry
)
==
neighbors
.
end
()
&&
neighbors
.
find
(
make_pair
(
entry
.
second
,
entry
.
first
))
==
neighbors
.
end
());
// No duplicates
neighbors
.
insert
(
entry
);
for
(
int
i
=
0
;
i
<
(
int
)
neighborList
.
getSortedAtoms
().
size
();
i
++
)
{
int
blockIndex
=
i
/
CpuNeighborList
::
BlockSize
;
int
indexInBlock
=
i
-
blockIndex
*
CpuNeighborList
::
BlockSize
;
char
mask
=
1
<<
indexInBlock
;
for
(
int
j
=
0
;
j
<
(
int
)
neighborList
.
getBlockExclusions
(
blockIndex
).
size
();
j
++
)
{
if
((
neighborList
.
getBlockExclusions
(
blockIndex
)[
j
]
&
mask
)
==
0
)
{
int
atom1
=
neighborList
.
getSortedAtoms
()[
i
];
int
atom2
=
neighborList
.
getBlockNeighbors
(
blockIndex
)[
j
];
pair
<
int
,
int
>
entry
=
make_pair
(
min
(
atom1
,
atom2
),
max
(
atom1
,
atom2
));
ASSERT
(
neighbors
.
find
(
entry
)
==
neighbors
.
end
()
&&
neighbors
.
find
(
make_pair
(
entry
.
second
,
entry
.
first
))
==
neighbors
.
end
());
// No duplicates
neighbors
.
insert
(
entry
);
}
}
}
// Check each particle pair and figure out whether they should be in the neighbor list.
...
...
@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
if
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
>
cutoff
*
cutoff
)
shouldInclude
=
false
;
bool
isIncluded
=
(
neighbors
.
find
(
make_pair
(
i
,
j
))
!=
neighbors
.
end
()
||
neighbors
.
find
(
make_pair
(
j
,
i
))
!=
neighbors
.
end
());
ASSERT_EQUAL
(
shouldInclude
,
isIncluded
);
if
(
shouldInclude
)
ASSERT
(
isIncluded
);
}
}
...
...
plugins/cpupme/src/CpuPmeKernels.cpp
View file @
79fd69c3
...
...
@@ -35,9 +35,9 @@
#include "CpuPmeKernels.h"
#include "SimTKOpenMMRealType.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/vectorize.h"
#include <cmath>
#include <cstring>
#include <smmintrin.h>
using
namespace
OpenMM
;
using
namespace
std
;
...
...
@@ -47,52 +47,50 @@ static const int PME_ORDER = 5;
bool
CpuCalcPmeReciprocalForceKernel
::
hasInitializedThreads
=
false
;
int
CpuCalcPmeReciprocalForceKernel
::
numThreads
=
0
;
#define EXTRACT_FLOAT(v, element) _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, element)))
static
void
spreadCharge
(
int
start
,
int
end
,
float
*
posq
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
float
temp
[
4
];
__m128
boxSize
=
_mm_set_ps
(
0
,
(
float
)
periodicBoxSize
[
2
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
0
]
);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
float
)
(
1
/
periodicBoxSize
[
2
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
0
]));
__m128
gridSize
=
_mm_set_ps
(
0
,
grid
z
,
gridy
,
grid
x
);
__m128i
gridSizeInt
=
_mm_set_epi32
(
0
,
grid
z
,
gridy
,
grid
x
);
__m128
one
=
_mm_set1_ps
(
1
);
__m128
scale
=
_mm_set1_ps
(
1.0
f
/
(
PME_ORDER
-
1
));
fvec4
boxSize
(
(
float
)
periodicBoxSize
[
0
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
2
],
0
);
fvec4
invBoxSize
(
(
float
)
(
1
/
periodicBoxSize
[
0
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
2
])
,
0
);
fvec4
gridSize
(
grid
x
,
gridy
,
grid
z
,
0
);
ivec4
gridSizeInt
(
grid
x
,
gridy
,
grid
z
,
0
);
fvec4
one
(
1
);
fvec4
scale
(
1.0
f
/
(
PME_ORDER
-
1
));
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
memset
(
grid
,
0
,
sizeof
(
float
)
*
gridx
*
gridy
*
gridz
);
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
// Find the position relative to the nearest grid point.
__m128
pos
=
_mm_loadu_p
s
(
&
posq
[
4
*
i
]);
__m128
posFloor
=
_mm_
floor
_ps
(
_mm_mul_ps
(
pos
,
invBoxSize
)
)
;
__m128
posInBox
=
_mm_sub_ps
(
pos
,
_mm_mul_ps
(
boxSize
,
posFloor
))
;
__m128
t
=
_mm_mul_ps
(
_mm_mul_ps
(
posInBox
,
invBoxSize
),
gridSize
)
;
__m128i
ti
=
_mm_cvttps_epi32
(
t
)
;
__m128
dr
=
_mm_sub_ps
(
t
,
_mm_cvtepi32_ps
(
ti
))
;
__m128i
gridIndex
=
_mm_sub_epi32
(
ti
,
_mm_and_si128
(
gridSizeInt
,
_mm_cmpeq_epi32
(
ti
,
gridSizeInt
)
))
;
fvec4
po
s
(
&
posq
[
4
*
i
]);
fvec4
posFloor
=
floor
(
pos
*
invBoxSize
);
fvec4
posInBox
=
pos
-
boxSize
*
posFloor
;
fvec4
t
=
posInBox
*
invBoxSize
*
gridSize
;
ivec4
ti
=
t
;
fvec4
dr
=
t
-
ti
;
ivec4
gridIndex
=
ti
-
(
gridSizeInt
&
ti
==
gridSizeInt
);
// Compute the B-spline coefficients.
__m128
data
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
_mm_setzero_ps
()
;
fvec4
data
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
0.0
f
;
data
[
1
]
=
dr
;
data
[
0
]
=
_mm_sub_ps
(
one
,
dr
)
;
data
[
0
]
=
one
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
__m128
div
=
_mm_set1_ps
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
dr
),
data
[
j
-
2
]
)
;
fvec4
div
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
data
[
j
-
k
-
1
]
=
_mm_mul_ps
(
div
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
k
)),
data
[
j
-
k
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
j
-
k
)
,
dr
)
,
data
[
j
-
k
-
1
])
))
;
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
k
)
*
data
[
j
-
k
-
2
]
+
(
fvec4
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
div
*
(
one
-
dr
)
*
data
[
0
];
}
data
[
PME_ORDER
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
dr
),
data
[
PME_ORDER
-
2
]
)
;
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
_mm_mul_ps
(
scale
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
j
)),
data
[
PME_ORDER
-
j
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
PME_ORDER
-
j
)
,
dr
)
,
data
[
PME_ORDER
-
j
-
1
])
))
;
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
j
)
*
data
[
PME_ORDER
-
j
-
2
]
+
(
fvec4
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
scale
*
(
one
-
dr
)
*
data
[
0
];
// Spread the charges.
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
)
;
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
)
;
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
)
;
int
gridIndexX
=
gridIndex
[
0
]
;
int
gridIndexY
=
gridIndex
[
1
]
;
int
gridIndexZ
=
gridIndex
[
2
]
;
if
(
gridIndexX
<
0
)
return
;
// This happens when a simulation blows up and coordinates become NaN.
int
zindex
[
PME_ORDER
];
...
...
@@ -101,21 +99,21 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
}
float
charge
=
epsilonFactor
*
posq
[
4
*
i
+
3
];
__m128
zdata0to3
=
_mm_set_ps
(
EXTRACT_FLOAT
(
data
[
3
],
2
),
EXTRACT_FLOAT
(
data
[
2
],
2
),
EXTRACT_FLOAT
(
data
[
1
],
2
),
EXTRACT_FLOAT
(
data
[
0
],
2
)
);
float
zdata4
=
EXTRACT_FLOAT
(
data
[
4
]
,
2
)
;
fvec4
zdata0to3
(
data
[
0
][
2
],
data
[
1
][
2
],
data
[
2
][
2
],
data
[
3
][
2
]
);
float
zdata4
=
data
[
4
]
[
2
]
;
if
(
gridIndexZ
+
4
<
gridz
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
EXTRACT_FLOAT
(
data
[
ix
]
,
0
)
;
float
xdata
=
charge
*
data
[
ix
]
[
0
]
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
EXTRACT_FLOAT
(
data
[
iy
]
,
1
)
;
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
))
;
_mm_storeu_ps
(
&
grid
[
ybase
+
gridIndexZ
]
,
_mm_add_ps
(
_mm_loadu_ps
(
&
grid
[
ybase
+
gridIndexZ
])
,
add0to3
))
;
float
multiplier
=
xdata
*
data
[
iy
]
[
1
]
;
fvec4
add0to3
=
zdata0to3
*
multiplier
;
(
fvec4
(
&
grid
[
ybase
+
gridIndexZ
]
)
+
add0to3
).
store
(
&
grid
[
ybase
+
gridIndexZ
]);
grid
[
ybase
+
zindex
[
4
]]
+=
multiplier
*
zdata4
;
}
}
...
...
@@ -125,14 +123,14 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
xdata
=
charge
*
EXTRACT_FLOAT
(
data
[
ix
]
,
0
)
;
float
xdata
=
charge
*
data
[
ix
]
[
0
]
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
multiplier
=
xdata
*
EXTRACT_FLOAT
(
data
[
iy
]
,
1
)
;
__m128
add0to3
=
_mm_mul_ps
(
zdata0to3
,
_mm_set1_ps
(
multiplier
))
;
_mm_
store
_ps
(
temp
,
add0to3
);
float
multiplier
=
xdata
*
data
[
iy
]
[
1
]
;
fvec4
add0to3
=
zdata0to3
*
multiplier
;
add0to3
.
store
(
temp
);
grid
[
ybase
+
zindex
[
0
]]
+=
temp
[
0
];
grid
[
ybase
+
zindex
[
1
]]
+=
temp
[
1
];
grid
[
ybase
+
zindex
[
2
]]
+=
temp
[
2
];
...
...
@@ -245,51 +243,51 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int g
}
static
void
interpolateForces
(
int
start
,
int
end
,
float
*
posq
,
float
*
force
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
periodicBoxSize
)
{
__m128
boxSize
=
_mm_set_ps
(
0
,
(
float
)
periodicBoxSize
[
2
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
0
]
);
__m128
invBoxSize
=
_mm_set_ps
(
0
,
(
float
)
(
1
/
periodicBoxSize
[
2
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
0
]));
__m128
gridSize
=
_mm_set_ps
(
0
,
grid
z
,
gridy
,
grid
x
);
__m128i
gridSizeInt
=
_mm_set_epi32
(
0
,
grid
z
,
gridy
,
grid
x
);
__m128
one
=
_mm_set1_ps
(
1
);
__m128
scale
=
_mm_set1_ps
(
1.0
f
/
(
PME_ORDER
-
1
));
fvec4
boxSize
(
(
float
)
periodicBoxSize
[
0
],
(
float
)
periodicBoxSize
[
1
],
(
float
)
periodicBoxSize
[
2
],
0
);
fvec4
invBoxSize
(
(
float
)
(
1
/
periodicBoxSize
[
0
]),
(
float
)
(
1
/
periodicBoxSize
[
1
]),
(
float
)
(
1
/
periodicBoxSize
[
2
])
,
0
);
fvec4
gridSize
(
grid
x
,
gridy
,
grid
z
,
0
);
ivec4
gridSizeInt
(
grid
x
,
gridy
,
grid
z
,
0
);
fvec4
one
(
1
);
fvec4
scale
(
1.0
f
/
(
PME_ORDER
-
1
));
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
for
(
int
i
=
start
;
i
<
end
;
i
++
)
{
// Find the position relative to the nearest grid point.
__m128
pos
=
_mm_loadu_p
s
(
&
posq
[
4
*
i
]);
__m128
posFloor
=
_mm_
floor
_ps
(
_mm_mul_ps
(
pos
,
invBoxSize
)
)
;
__m128
posInBox
=
_mm_sub_ps
(
pos
,
_mm_mul_ps
(
boxSize
,
posFloor
))
;
__m128
t
=
_mm_mul_ps
(
_mm_mul_ps
(
posInBox
,
invBoxSize
),
gridSize
)
;
__m128i
ti
=
_mm_cvttps_epi32
(
t
)
;
__m128
dr
=
_mm_sub_ps
(
t
,
_mm_cvtepi32_ps
(
ti
))
;
__m128i
gridIndex
=
_mm_sub_epi32
(
ti
,
_mm_and_si128
(
gridSizeInt
,
_mm_cmpeq_epi32
(
ti
,
gridSizeInt
)
))
;
fvec4
po
s
(
&
posq
[
4
*
i
]);
fvec4
posFloor
=
floor
(
pos
*
invBoxSize
);
fvec4
posInBox
=
pos
-
boxSize
*
posFloor
;
fvec4
t
=
posInBox
*
invBoxSize
*
gridSize
;
ivec4
ti
=
t
;
fvec4
dr
=
t
-
ti
;
ivec4
gridIndex
=
ti
-
(
gridSizeInt
&
ti
==
gridSizeInt
);
// Compute the B-spline coefficients.
__m128
data
[
PME_ORDER
];
__m128
ddata
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
_mm_setzero_ps
()
;
fvec4
data
[
PME_ORDER
];
fvec4
ddata
[
PME_ORDER
];
data
[
PME_ORDER
-
1
]
=
0.0
f
;
data
[
1
]
=
dr
;
data
[
0
]
=
_mm_sub_ps
(
one
,
dr
)
;
data
[
0
]
=
one
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
__m128
div
=
_mm_set1_ps
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
dr
),
data
[
j
-
2
]
)
;
fvec4
div
(
1.0
f
/
(
j
-
1
));
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
j
-
1
;
k
++
)
data
[
j
-
k
-
1
]
=
_mm_mul_ps
(
div
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
k
)),
data
[
j
-
k
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
j
-
k
)
,
dr
)
,
data
[
j
-
k
-
1
])
))
;
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
div
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
k
)
*
data
[
j
-
k
-
2
]
+
(
fvec4
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
div
*
(
one
-
dr
)
*
data
[
0
];
}
ddata
[
0
]
=
_mm_sub_ps
(
_mm_set1_ps
(
0
),
data
[
0
]
)
;
ddata
[
0
]
=
-
data
[
0
];
for
(
int
j
=
1
;
j
<
PME_ORDER
;
j
++
)
ddata
[
j
]
=
_mm_sub_ps
(
data
[
j
-
1
]
,
data
[
j
]
)
;
data
[
PME_ORDER
-
1
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
dr
),
data
[
PME_ORDER
-
2
]
)
;
ddata
[
j
]
=
data
[
j
-
1
]
-
data
[
j
];
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
_mm_mul_ps
(
scale
,
_mm_add_ps
(
_mm_mul_ps
(
_mm_add_ps
(
dr
,
_mm_set1_ps
(
j
)),
data
[
PME_ORDER
-
j
-
2
]),
_mm_mul_ps
(
_mm_sub_ps
(
_mm_set1_ps
(
PME_ORDER
-
j
)
,
dr
)
,
data
[
PME_ORDER
-
j
-
1
])
))
;
data
[
0
]
=
_mm_mul_ps
(
_mm_mul_ps
(
scale
,
_mm_sub_ps
(
one
,
dr
)
),
data
[
0
]
)
;
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
j
)
*
data
[
PME_ORDER
-
j
-
2
]
+
(
fvec4
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
scale
*
(
one
-
dr
)
*
data
[
0
];
// Compute the force on this atom.
int
gridIndexX
=
_mm_extract_epi32
(
gridIndex
,
0
)
;
int
gridIndexY
=
_mm_extract_epi32
(
gridIndex
,
1
)
;
int
gridIndexZ
=
_mm_extract_epi32
(
gridIndex
,
2
)
;
int
gridIndexX
=
gridIndex
[
0
]
;
int
gridIndexY
=
gridIndex
[
1
]
;
int
gridIndexZ
=
gridIndex
[
2
]
;
if
(
gridIndexX
<
0
)
return
;
// This happens when a simulation blows up and coordinates become NaN.
int
zindex
[
PME_ORDER
];
...
...
@@ -297,34 +295,34 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
zindex
[
j
]
=
gridIndexZ
+
j
;
zindex
[
j
]
-=
(
zindex
[
j
]
>=
gridz
?
gridz
:
0
);
}
__m128
zdata
[
PME_ORDER
];
fvec4
zdata
[
PME_ORDER
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
zdata
[
j
]
=
_mm_set_ps
(
0
,
EXTRACT_FLOAT
(
ddata
[
j
],
2
),
EXTRACT_FLOAT
(
data
[
j
],
2
),
EXTRACT_FLOAT
(
data
[
j
]
,
2
)
);
__m128
f
=
_mm_set1_ps
(
0
)
;
zdata
[
j
]
=
fvec4
(
data
[
j
][
2
],
data
[
j
][
2
],
d
data
[
j
]
[
2
],
0
);
fvec4
f
=
0.0
f
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndexX
+
ix
;
xbase
-=
(
xbase
>=
gridx
?
gridx
:
0
);
xbase
=
xbase
*
gridy
*
gridz
;
float
dx
=
EXTRACT_FLOAT
(
data
[
ix
]
,
0
)
;
float
ddx
=
EXTRACT_FLOAT
(
ddata
[
ix
]
,
0
)
;
__m128
xdata
=
_mm_set_ps
(
0
,
dx
,
dx
,
ddx
);
float
dx
=
data
[
ix
]
[
0
]
;
float
ddx
=
ddata
[
ix
]
[
0
]
;
fvec4
xdata
(
ddx
,
dx
,
dx
,
0
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndexY
+
iy
;
ybase
-=
(
ybase
>=
gridy
?
gridy
:
0
);
ybase
=
xbase
+
ybase
*
gridz
;
float
dy
=
EXTRACT_FLOAT
(
data
[
iy
]
,
1
)
;
float
ddy
=
EXTRACT_FLOAT
(
ddata
[
iy
]
,
1
)
;
__m128
xydata
=
_mm_mul_ps
(
xdata
,
_mm_set_ps
(
0
,
dy
,
ddy
,
dy
)
);
float
dy
=
data
[
iy
]
[
1
]
;
float
ddy
=
ddata
[
iy
]
[
1
]
;
fvec4
xydata
=
xdata
*
fvec4
(
dy
,
ddy
,
dy
,
0
);
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
__m128
gridValue
=
_mm_set1_ps
(
grid
[
ybase
+
zindex
[
iz
]]);
f
=
_mm_add_ps
(
f
,
_mm_mul_ps
(
xydata
,
_mm_mul_ps
(
zdata
[
iz
]
,
gridValue
)))
;
fvec4
gridValue
(
grid
[
ybase
+
zindex
[
iz
]]);
f
=
f
+
xydata
*
zdata
[
iz
]
*
gridValue
;
}
}
}
f
=
_mm_mul_ps
(
invBoxSize
,
_mm_mul_ps
(
gridSize
,
_mm_mul_ps
(
f
,
_mm_set1_ps
(
-
epsilonFactor
*
posq
[
4
*
i
+
3
])
)))
;
_mm_
store
_ps
(
&
force
[
4
*
i
]
,
f
);
f
=
invBoxSize
*
gridSize
*
f
*
(
-
epsilonFactor
*
posq
[
4
*
i
+
3
]);
f
.
store
(
&
force
[
4
*
i
]
);
}
}
...
...
@@ -509,10 +507,10 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
threadWait
();
int
numGrids
=
threadData
.
size
();
for
(
int
i
=
gridStart
;
i
<
gridEnd
;
i
+=
4
)
{
__m128
sum
=
_mm_load_ps
(
&
realGrid
[
i
]);
fvec4
sum
(
&
realGrid
[
i
]);
for
(
int
j
=
1
;
j
<
numGrids
;
j
++
)
sum
=
_mm_add_ps
(
sum
,
_mm_load_ps
(
&
threadData
[
j
]
->
tempGrid
[
i
])
)
;
_mm_
store
_ps
(
&
realGrid
[
i
]
,
sum
);
sum
+
=
fvec4
(
&
threadData
[
j
]
->
tempGrid
[
i
]);
sum
.
store
(
&
realGrid
[
i
]);
}
threadWait
();
if
(
lastBoxSize
!=
periodicBoxSize
)
{
...
...
wrappers/python/src/swig_doxygen/swigInputConfig.py
View file @
79fd69c3
...
...
@@ -145,6 +145,8 @@ SKIP_METHODS = [('State',),
(
'IntegrateDrudeSCFStepKernel'
,),
(
'XmlSerializer'
,
'serialize'
),
(
'XmlSerializer'
,
'deserialize'
),
(
'fvec4'
,),
(
'ivec4'
,),
]
# The build script assumes method args that are non-const references are
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment