Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
588a4c9d
Commit
588a4c9d
authored
Dec 10, 2013
by
peastman
Browse files
Merged changes from main branch
parents
bbf3de23
ff6af025
Changes
77
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2173 additions
and
212 deletions
+2173
-212
CMakeLists.txt
CMakeLists.txt
+14
-10
examples/CMakeLists.txt
examples/CMakeLists.txt
+17
-11
openmmapi/include/openmm/internal/vectorize.h
openmmapi/include/openmm/internal/vectorize.h
+12
-0
platforms/cpu/CMakeLists.txt
platforms/cpu/CMakeLists.txt
+0
-4
platforms/cpu/include/AlignedArray.h
platforms/cpu/include/AlignedArray.h
+104
-0
platforms/cpu/include/CpuGBSAOBCForce.h
platforms/cpu/include/CpuGBSAOBCForce.h
+6
-2
platforms/cpu/include/CpuNeighborList.h
platforms/cpu/include/CpuNeighborList.h
+2
-1
platforms/cpu/include/CpuNonbondedForce.h
platforms/cpu/include/CpuNonbondedForce.h
+5
-2
platforms/cpu/include/CpuPlatform.h
platforms/cpu/include/CpuPlatform.h
+3
-2
platforms/cpu/include/CpuSETTLE.h
platforms/cpu/include/CpuSETTLE.h
+78
-0
platforms/cpu/sharedTarget/CMakeLists.txt
platforms/cpu/sharedTarget/CMakeLists.txt
+2
-3
platforms/cpu/src/CpuGBSAOBCForce.cpp
platforms/cpu/src/CpuGBSAOBCForce.cpp
+49
-45
platforms/cpu/src/CpuKernels.cpp
platforms/cpu/src/CpuKernels.cpp
+6
-6
platforms/cpu/src/CpuNeighborList.cpp
platforms/cpu/src/CpuNeighborList.cpp
+38
-13
platforms/cpu/src/CpuNonbondedForce.cpp
platforms/cpu/src/CpuNonbondedForce.cpp
+125
-108
platforms/cpu/src/CpuPlatform.cpp
platforms/cpu/src/CpuPlatform.cpp
+9
-2
platforms/cpu/src/CpuSETTLE.cpp
platforms/cpu/src/CpuSETTLE.cpp
+104
-0
platforms/cpu/src/gmx_atomic.h
platforms/cpu/src/gmx_atomic.h
+1596
-0
platforms/cpu/tests/CMakeLists.txt
platforms/cpu/tests/CMakeLists.txt
+1
-2
platforms/cpu/tests/TestCpuNeighborList.cpp
platforms/cpu/tests/TestCpuNeighborList.cpp
+2
-1
No files found.
CMakeLists.txt
View file @
588a4c9d
...
@@ -105,16 +105,20 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
...
@@ -105,16 +105,20 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
SET
(
LIB64
)
SET
(
LIB64
)
ENDIF
(
CMAKE_SIZEOF_VOID_P EQUAL 8
)
ENDIF
(
CMAKE_SIZEOF_VOID_P EQUAL 8
)
# Build universal binaries compatible with OS X 10.7
IF
(
APPLE
)
IF
(
APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET
)
# Build universal binaries compatible with OS X 10.7
SE
T
(
CMAKE_OSX_
ARCHITECTURES
"i386;x86_64"
CACHE STRING
"The processor architectures to build for"
FORCE
)
IF
(
NO
T CMAKE_OSX_
DEPLOYMENT_TARGET
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.7"
CACHE STRING
"The minimum version of OS X to support"
FORCE
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.7"
CACHE STRING
"The minimum version of OS X to support"
FORCE
)
ENDIF
(
APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET
)
ENDIF
(
NOT CMAKE_OSX_DEPLOYMENT_TARGET
)
IF
(
NOT CMAKE_OSX_ARCHITECTURES
)
SET
(
CMAKE_OSX_ARCHITECTURES
"i386;x86_64"
CACHE STRING
"The processor architectures to build for"
FORCE
)
ENDIF
(
NOT CMAKE_OSX_ARCHITECTURES
)
# Improve the linking behavior of Mac libraries
# Improve the linking behavior of Mac libraries
IF
(
APPLE
)
SET
(
CMAKE_INSTALL_NAME_DIR
"@rpath"
)
SET
(
CMAKE_INSTALL_NAME_DIR
"@rpath"
)
SET_PROPERTY
(
GLOBAL PROPERTY COMPILE_FLAGS
"-stdlib=libc++ -mmacosx-version-min=10.7"
)
SET
(
EXTRA_COMPILE_FLAGS
"-stdlib=libc++"
)
ELSE
(
APPLE
)
SET
(
EXTRA_COMPILE_FLAGS
)
ENDIF
(
APPLE
)
ENDIF
(
APPLE
)
IF
(
UNIX AND NOT CMAKE_BUILD_TYPE
)
IF
(
UNIX AND NOT CMAKE_BUILD_TYPE
)
...
@@ -266,7 +270,7 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
...
@@ -266,7 +270,7 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
COMPILE_FLAGS
"
-DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY"
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY"
)
IF
(
WIN32
)
IF
(
WIN32
)
ADD_DEPENDENCIES
(
${
SHARED_TARGET
}
PthreadsLibraries
)
ADD_DEPENDENCIES
(
${
SHARED_TARGET
}
PthreadsLibraries
)
ENDIF
(
WIN32
)
ENDIF
(
WIN32
)
...
@@ -274,7 +278,7 @@ ENDIF(WIN32)
...
@@ -274,7 +278,7 @@ ENDIF(WIN32)
SET
(
OPENMM_BUILD_STATIC_LIB OFF CACHE BOOL
"Whether to build static OpenMM libraries"
)
SET
(
OPENMM_BUILD_STATIC_LIB OFF CACHE BOOL
"Whether to build static OpenMM libraries"
)
IF
(
OPENMM_BUILD_STATIC_LIB
)
IF
(
OPENMM_BUILD_STATIC_LIB
)
ADD_LIBRARY
(
${
STATIC_TARGET
}
STATIC
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
ADD_LIBRARY
(
${
STATIC_TARGET
}
STATIC
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES
COMPILE_FLAGS
"
-DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY"
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY"
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
IF
(
OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS
)
IF
(
OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS
)
...
...
examples/CMakeLists.txt
View file @
588a4c9d
...
@@ -33,7 +33,9 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
...
@@ -33,7 +33,9 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.cpp
)
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
PROPERTIES
PROPERTIES
PROJECT_LABEL
"Example -
${
EX_ROOT
}
"
)
PROJECT_LABEL
"Example -
${
EX_ROOT
}
"
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
ENDIF
(
BUILD_TESTING_SHARED
)
ENDIF
(
BUILD_TESTING_SHARED
)
...
@@ -43,8 +45,9 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
...
@@ -43,8 +45,9 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.cpp
)
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
PROPERTIES
PROPERTIES
COMPILE_FLAGS
"-DOPENMM_USE_STATIC_LIBRARIES"
PROJECT_LABEL
"Example -
${
EX_STATIC
}
"
PROJECT_LABEL
"Example -
${
EX_STATIC
}
"
)
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_USE_STATIC_LIBRARIES"
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
ENDIF
(
BUILD_TESTING_STATIC
)
ENDIF
(
BUILD_TESTING_STATIC
)
...
@@ -64,7 +67,9 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
...
@@ -64,7 +67,9 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.c Empty.cpp
)
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.c Empty.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
PROPERTIES
PROPERTIES
PROJECT_LABEL
"Example C -
${
EX_ROOT
}
"
)
PROJECT_LABEL
"Example C -
${
EX_ROOT
}
"
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
ADD_DEPENDENCIES
(
${
EX_ROOT
}
ApiWrappers
)
ADD_DEPENDENCIES
(
${
EX_ROOT
}
ApiWrappers
)
ENDIF
(
BUILD_TESTING_SHARED
)
ENDIF
(
BUILD_TESTING_SHARED
)
...
@@ -77,8 +82,9 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
...
@@ -77,8 +82,9 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.c Empty.cpp
)
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.c Empty.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
PROPERTIES
PROPERTIES
COMPILE_FLAGS
"-DOPENMM_USE_STATIC_LIBRARIES"
PROJECT_LABEL
"Example C -
${
EX_STATIC
}
"
PROJECT_LABEL
"Example C -
${
EX_STATIC
}
"
)
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_USE_STATIC_LIBRARIES"
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
ADD_DEPENDENCIES
(
${
EX_STATIC
}
ApiWrappers
)
ADD_DEPENDENCIES
(
${
EX_STATIC
}
ApiWrappers
)
ENDIF
(
BUILD_TESTING_STATIC
)
ENDIF
(
BUILD_TESTING_STATIC
)
...
...
openmmapi/include/openmm/internal/vectorize.h
View file @
588a4c9d
...
@@ -91,6 +91,9 @@ public:
...
@@ -91,6 +91,9 @@ public:
fvec4
operator
&
(
fvec4
other
)
const
{
fvec4
operator
&
(
fvec4
other
)
const
{
return
_mm_and_ps
(
val
,
other
);
return
_mm_and_ps
(
val
,
other
);
}
}
fvec4
operator
|
(
fvec4
other
)
const
{
return
_mm_or_ps
(
val
,
other
);
}
fvec4
operator
==
(
fvec4
other
)
const
{
fvec4
operator
==
(
fvec4
other
)
const
{
return
_mm_cmpeq_ps
(
val
,
other
);
return
_mm_cmpeq_ps
(
val
,
other
);
}
}
...
@@ -157,6 +160,9 @@ public:
...
@@ -157,6 +160,9 @@ public:
ivec4
operator
&
(
ivec4
other
)
const
{
ivec4
operator
&
(
ivec4
other
)
const
{
return
_mm_and_si128
(
val
,
other
);
return
_mm_and_si128
(
val
,
other
);
}
}
ivec4
operator
|
(
ivec4
other
)
const
{
return
_mm_or_si128
(
val
,
other
);
}
ivec4
operator
==
(
ivec4
other
)
const
{
ivec4
operator
==
(
ivec4
other
)
const
{
return
_mm_cmpeq_epi32
(
val
,
other
);
return
_mm_cmpeq_epi32
(
val
,
other
);
}
}
...
@@ -267,5 +273,11 @@ static inline fvec4 operator/(float v1, fvec4 v2) {
...
@@ -267,5 +273,11 @@ static inline fvec4 operator/(float v1, fvec4 v2) {
return
fvec4
(
v1
)
/
v2
;
return
fvec4
(
v1
)
/
v2
;
}
}
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
fvec4
v1
,
fvec4
v2
,
ivec4
mask
)
{
return
fvec4
(
_mm_blendv_ps
(
v1
.
val
,
v2
.
val
,
_mm_castsi128_ps
(
mask
.
val
)));
}
#endif
/*OPENMM_VECTORIZE_H_*/
#endif
/*OPENMM_VECTORIZE_H_*/
platforms/cpu/CMakeLists.txt
View file @
588a4c9d
...
@@ -14,10 +14,6 @@
...
@@ -14,10 +14,6 @@
# libOpenMMCPU_static[_d].a
# libOpenMMCPU_static[_d].a
#----------------------------------------------------
#----------------------------------------------------
IF
(
APPLE
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.6"
)
ENDIF
(
APPLE
)
SUBDIRS
(
tests
)
SUBDIRS
(
tests
)
# The source is organized into subdirectories, but we handle them all from
# The source is organized into subdirectories, but we handle them all from
...
...
platforms/cpu/include/AlignedArray.h
0 → 100644
View file @
588a4c9d
#ifndef OPENMM_ALIGNEDARRAY_H_
#define OPENMM_ALIGNEDARRAY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
namespace
OpenMM
{
/**
* This class represents an array in memory whose starting point is guaranteed to
* be aligned with a 16 byte boundary. This can improve the performance of vectorized
* code, since loads and stores are more efficient.
*/
template
<
class
T
>
class
AlignedArray
{
public:
/**
* Default constructor, to allow AlignedArrays to be used inside collections.
*/
AlignedArray
()
:
dataSize
(
0
),
baseData
(
0
),
data
(
0
)
{
}
/**
* Create an Aligned array that contains a specified number of elements.
*/
AlignedArray
(
int
size
)
{
allocate
(
size
);
}
~
AlignedArray
()
{
if
(
baseData
!=
0
)
delete
[]
baseData
;
}
/**
* Get the number of elements in the array.
*/
int
size
()
const
{
return
dataSize
;
}
/**
* Change the size of the array. This may cause all contents to be lost.
*/
void
resize
(
int
size
)
{
if
(
dataSize
==
size
)
return
;
if
(
baseData
!=
0
)
delete
[]
baseData
;
allocate
(
size
);
}
/**
* Get a reference to an element of the array.
*/
T
&
operator
[](
int
i
)
{
return
data
[
i
];
}
/**
* Get a const reference to an element of the array.
*/
const
T
&
operator
[](
int
i
)
const
{
return
data
[
i
];
}
private:
void
allocate
(
int
size
)
{
dataSize
=
size
;
baseData
=
new
char
[
size
*
sizeof
(
T
)
+
16
];
char
*
offsetData
=
baseData
+
15
;
offsetData
-=
(
long
long
)
offsetData
&
0xF
;
data
=
(
T
*
)
offsetData
;
}
int
dataSize
;
char
*
baseData
;
T
*
data
;
};
}
// namespace OpenMM
#endif
/*OPENMM_ALIGNEDARRAY_H_*/
platforms/cpu/include/CpuGBSAOBCForce.h
View file @
588a4c9d
...
@@ -25,6 +25,7 @@
...
@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
#define OPENMM_CPU_GBSAOBC_FORCE_H__
#define OPENMM_CPU_GBSAOBC_FORCE_H__
#include "AlignedArray.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/vectorize.h"
#include "openmm/internal/vectorize.h"
#include <set>
#include <set>
...
@@ -84,7 +85,7 @@ public:
...
@@ -84,7 +85,7 @@ public:
* @param totalEnergy total energy
* @param totalEnergy total energy
* @param threads the thread pool to use
* @param threads the thread pool to use
*/
*/
void
computeForce
(
const
std
::
vector
<
float
>&
posq
,
std
::
vector
<
std
::
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
void
computeForce
(
const
AlignedArray
<
float
>&
posq
,
std
::
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
/**
/**
* This routine contains the code executed by each thread.
* This routine contains the code executed by each thread.
...
@@ -105,10 +106,13 @@ private:
...
@@ -105,10 +106,13 @@ private:
float
logDX
,
logDXInv
;
float
logDX
,
logDXInv
;
// The following variables are used to make information accessible to the individual threads.
// The following variables are used to make information accessible to the individual threads.
float
const
*
posq
;
float
const
*
posq
;
std
::
vector
<
std
::
vector
<
float
>
>*
threadForce
;
std
::
vector
<
AlignedArray
<
float
>
>*
threadForce
;
bool
includeEnergy
;
bool
includeEnergy
;
void
*
atomicCounter
;
static
const
int
NUM_TABLE_POINTS
;
static
const
int
NUM_TABLE_POINTS
;
static
const
float
TABLE_MIN
;
static
const
float
TABLE_MAX
;
/**
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* Compute the displacement and squared distance between a collection of points, optionally using
...
...
platforms/cpu/include/CpuNeighborList.h
View file @
588a4c9d
...
@@ -32,6 +32,7 @@
...
@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "windowsExportCpu.h"
#include "windowsExportCpu.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
#include <set>
#include <set>
...
@@ -46,7 +47,7 @@ public:
...
@@ -46,7 +47,7 @@ public:
class
Voxels
;
class
Voxels
;
static
const
int
BlockSize
;
static
const
int
BlockSize
;
CpuNeighborList
();
CpuNeighborList
();
void
computeNeighborList
(
int
numAtoms
,
const
std
::
vector
<
float
>&
atomLocations
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
void
computeNeighborList
(
int
numAtoms
,
const
AlignedArray
<
float
>&
atomLocations
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
);
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
);
int
getNumBlocks
()
const
;
int
getNumBlocks
()
const
;
const
std
::
vector
<
int
>&
getSortedAtoms
()
const
;
const
std
::
vector
<
int
>&
getSortedAtoms
()
const
;
...
...
platforms/cpu/include/CpuNonbondedForce.h
View file @
588a4c9d
...
@@ -25,6 +25,7 @@
...
@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
...
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
...
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
std
::
vector
<
RealVec
>&
atomCoordinates
,
const
std
::
vector
<
std
::
pair
<
float
,
float
>
>&
atomParameters
,
void
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
std
::
vector
<
RealVec
>&
atomCoordinates
,
const
std
::
vector
<
std
::
pair
<
float
,
float
>
>&
atomParameters
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
std
::
vector
<
std
::
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
std
::
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
/**
/**
* This routine contains the code executed by each thread.
* This routine contains the code executed by each thread.
...
@@ -156,6 +157,7 @@ private:
...
@@ -156,6 +157,7 @@ private:
bool
periodic
;
bool
periodic
;
bool
ewald
;
bool
ewald
;
bool
pme
;
bool
pme
;
bool
tableIsValid
;
const
CpuNeighborList
*
neighborList
;
const
CpuNeighborList
*
neighborList
;
float
periodicBoxSize
[
3
];
float
periodicBoxSize
[
3
];
float
cutoffDistance
,
switchingDistance
;
float
cutoffDistance
,
switchingDistance
;
...
@@ -172,8 +174,9 @@ private:
...
@@ -172,8 +174,9 @@ private:
RealVec
const
*
atomCoordinates
;
RealVec
const
*
atomCoordinates
;
std
::
pair
<
float
,
float
>
const
*
atomParameters
;
std
::
pair
<
float
,
float
>
const
*
atomParameters
;
std
::
set
<
int
>
const
*
exclusions
;
std
::
set
<
int
>
const
*
exclusions
;
std
::
vector
<
std
::
vector
<
float
>
>*
threadForce
;
std
::
vector
<
AlignedArray
<
float
>
>*
threadForce
;
bool
includeEnergy
;
bool
includeEnergy
;
void
*
atomicCounter
;
static
const
float
TWO_OVER_SQRT_PI
;
static
const
float
TWO_OVER_SQRT_PI
;
static
const
int
NUM_TABLE_POINTS
;
static
const
int
NUM_TABLE_POINTS
;
...
...
platforms/cpu/include/CpuPlatform.h
View file @
588a4c9d
...
@@ -32,6 +32,7 @@
...
@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "ReferencePlatform.h"
#include "ReferencePlatform.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
...
@@ -69,8 +70,8 @@ private:
...
@@ -69,8 +70,8 @@ private:
class
CpuPlatform
::
PlatformData
{
class
CpuPlatform
::
PlatformData
{
public:
public:
PlatformData
(
int
numParticles
);
PlatformData
(
int
numParticles
);
std
::
vector
<
float
>
posq
;
AlignedArray
<
float
>
posq
;
std
::
vector
<
std
::
vector
<
float
>
>
threadForce
;
std
::
vector
<
AlignedArray
<
float
>
>
threadForce
;
ThreadPool
threads
;
ThreadPool
threads
;
bool
isPeriodic
;
bool
isPeriodic
;
};
};
...
...
platforms/cpu/include/CpuSETTLE.h
0 → 100644
View file @
588a4c9d
#ifndef OPENMM_CPUSETTLE_H_
#define OPENMM_CPUSETTLE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "ReferenceSETTLEAlgorithm.h"
#include "openmm/System.h"
#include "openmm/internal/ThreadPool.h"
#include <vector>
namespace
OpenMM
{
/**
* This class uses multiple ReferenceSETTLEAlgorithm objects to execute the algorithm in parallel.
*/
class
OPENMM_EXPORT
CpuSETTLE
:
public
ReferenceConstraintAlgorithm
{
public:
class
ApplyToPositionsTask
;
class
ApplyToVelocitiesTask
;
CpuSETTLE
(
const
System
&
system
,
const
ReferenceSETTLEAlgorithm
&
settle
,
ThreadPool
&
threads
);
~
CpuSETTLE
();
/**
* Apply the constraint algorithm.
*
* @param atomCoordinates the original atom coordinates
* @param atomCoordinatesP the new atom coordinates
* @param inverseMasses 1/mass
* @param tolerance the constraint tolerance
*/
void
apply
(
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
std
::
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
);
/**
* Apply the constraint algorithm to velocities.
*
* @param atomCoordinates the atom coordinates
* @param atomCoordinatesP the velocities to modify
* @param inverseMasses 1/mass
* @param tolerance the constraint tolerance
*/
void
applyToVelocities
(
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
std
::
vector
<
OpenMM
::
RealVec
>&
velocities
,
std
::
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
);
private:
std
::
vector
<
ReferenceSETTLEAlgorithm
*>
threadSettle
;
ThreadPool
&
threads
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CPUSETTLE_H_*/
platforms/cpu/sharedTarget/CMakeLists.txt
View file @
588a4c9d
GET_PROPERTY
(
COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS
)
SET_SOURCE_FILES_PROPERTIES
(
${
SOURCE_FILES
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-msse4.1"
)
SET_SOURCE_FILES_PROPERTIES
(
${
SOURCE_FILES
}
PROPERTIES COMPILE_FLAGS
"
${
COMPILE_FLAGS
}
-msse4.1"
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
...
@@ -8,6 +7,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
...
@@ -8,6 +7,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
PTHREADS_LIB
}
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
PTHREADS_LIB
}
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
COMPILE_FLAGS
"
-DOPENMM_CPU_BUILDING_SHARED_LIBRARY"
LINK_FLAGS
"
${
COMPILE_FLAGS
}
"
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_CPU_BUILDING_SHARED_LIBRARY"
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
platforms/cpu/src/CpuGBSAOBCForce.cpp
View file @
588a4c9d
...
@@ -24,14 +24,16 @@
...
@@ -24,14 +24,16 @@
#include "CpuGBSAOBCForce.h"
#include "CpuGBSAOBCForce.h"
#include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMRealType.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
#include <cmath>
#include <cmath>
using
namespace
std
;
using
namespace
std
;
using
namespace
OpenMM
;
using
namespace
OpenMM
;
const
int
CpuGBSAOBCForce
::
NUM_TABLE_POINTS
=
1025
;
const
int
CpuGBSAOBCForce
::
NUM_TABLE_POINTS
=
4096
;
const
float
CpuGBSAOBCForce
::
TABLE_MIN
=
0.25
f
;
const
float
CpuGBSAOBCForce
::
TABLE_MAX
=
1.5
f
;
class
CpuGBSAOBCForce
::
ComputeTask
:
public
ThreadPool
::
Task
{
class
CpuGBSAOBCForce
::
ComputeTask
:
public
ThreadPool
::
Task
{
public:
public:
...
@@ -44,22 +46,12 @@ public:
...
@@ -44,22 +46,12 @@ public:
};
};
CpuGBSAOBCForce
::
CpuGBSAOBCForce
()
:
cutoff
(
false
),
periodic
(
false
)
{
CpuGBSAOBCForce
::
CpuGBSAOBCForce
()
:
cutoff
(
false
),
periodic
(
false
)
{
logDX
=
0.5
/
NUM_TABLE_POINTS
;
logDX
=
(
TABLE_MAX
-
TABLE_MIN
)
/
NUM_TABLE_POINTS
;
logDXInv
=
1.0
f
/
logDX
;
logDXInv
=
1.0
f
/
logDX
;
vector
<
double
>
x
(
NUM_TABLE_POINTS
+
1
);
logTable
.
resize
(
NUM_TABLE_POINTS
+
4
);
vector
<
double
>
y
(
NUM_TABLE_POINTS
+
1
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
4
;
i
++
)
{
vector
<
double
>
deriv
;
double
x
=
TABLE_MIN
+
i
*
logDX
;
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
1
;
i
++
)
{
logTable
[
i
]
=
log
(
x
);
x
[
i
]
=
0.5
+
i
*
0.5
/
NUM_TABLE_POINTS
;
y
[
i
]
=
log
(
x
[
i
]);
}
SplineFitter
::
createNaturalSpline
(
x
,
y
,
deriv
);
logTable
.
resize
(
4
*
NUM_TABLE_POINTS
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
;
i
++
)
{
logTable
[
4
*
i
]
=
(
float
)
y
[
i
];
logTable
[
4
*
i
+
1
]
=
(
float
)
y
[
i
+
1
];
logTable
[
4
*
i
+
2
]
=
(
float
)
(
deriv
[
i
]
*
logDX
*
logDX
/
6
);
logTable
[
4
*
i
+
3
]
=
(
float
)
(
deriv
[
i
+
1
]
*
logDX
*
logDX
/
6
);
}
}
}
}
...
@@ -93,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
...
@@ -93,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
obcChain
.
resize
(
params
.
size
()
+
3
);
obcChain
.
resize
(
params
.
size
()
+
3
);
}
}
void
CpuGBSAOBCForce
::
computeForce
(
const
std
::
vector
<
float
>&
posq
,
vector
<
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
void
CpuGBSAOBCForce
::
computeForce
(
const
AlignedArray
<
float
>&
posq
,
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
// Record the parameters for the threads.
// Record the parameters for the threads.
this
->
posq
=
&
posq
[
0
];
this
->
posq
=
&
posq
[
0
];
...
@@ -104,16 +96,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
...
@@ -104,16 +96,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
threadBornForces
.
resize
(
numThreads
);
threadBornForces
.
resize
(
numThreads
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
threadBornForces
[
i
].
resize
(
particleParams
.
size
()
+
3
);
threadBornForces
[
i
].
resize
(
particleParams
.
size
()
+
3
);
gmx_atomic_t
counter
;
this
->
atomicCounter
=
&
counter
;
// Signal the threads to start running and wait for them to finish.
// Signal the threads to start running and wait for them to finish.
ComputeTask
task
(
*
this
);
ComputeTask
task
(
*
this
);
gmx_atomic_set
(
&
counter
,
0
);
threads
.
execute
(
task
);
threads
.
execute
(
task
);
threads
.
waitForThreads
();
// Compute Born radii
threads
.
waitForThreads
();
// Compute Born radii
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
resumeThreads
();
threads
.
waitForThreads
();
// Compute surface area term
threads
.
waitForThreads
();
// Compute surface area term
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
resumeThreads
();
threads
.
waitForThreads
();
// First loop
threads
.
waitForThreads
();
// First loop
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
resumeThreads
();
threads
.
waitForThreads
();
// Second loop
threads
.
waitForThreads
();
// Second loop
...
@@ -141,8 +139,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -141,8 +139,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Calculate Born radii
// Calculate Born radii
for
(
int
blockStart
=
start
;
blockStart
<
end
;
blockStart
+=
4
)
{
while
(
true
)
{
int
numInBlock
=
min
(
4
,
end
-
blockStart
);
int
blockStart
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
4
);
if
(
blockStart
>=
numParticles
)
break
;
int
numInBlock
=
min
(
4
,
numParticles
-
blockStart
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
...
@@ -213,7 +214,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -213,7 +214,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
vector
<
float
>&
bornForces
=
threadBornForces
[
threadIndex
];
vector
<
float
>&
bornForces
=
threadBornForces
[
threadIndex
];
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
bornForces
[
i
]
=
0.0
f
;
bornForces
[
i
]
=
0.0
f
;
for
(
int
atomI
=
start
;
atomI
<
end
;
atomI
++
)
{
while
(
true
)
{
int
atomI
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
atomI
>=
numParticles
)
break
;
if
(
bornRadii
[
atomI
]
>
0
)
{
if
(
bornRadii
[
atomI
]
>
0
)
{
float
radiusI
=
particleParams
[
atomI
].
first
+
dielectricOffset
;
float
radiusI
=
particleParams
[
atomI
].
first
+
dielectricOffset
;
float
r
=
radiusI
+
probeRadius
;
float
r
=
radiusI
+
probeRadius
;
...
@@ -235,8 +239,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -235,8 +239,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
preFactor
=
ONE_4PI_EPS0
*
((
1.0
f
/
solventDielectric
)
-
(
1.0
f
/
soluteDielectric
));
preFactor
=
ONE_4PI_EPS0
*
((
1.0
f
/
solventDielectric
)
-
(
1.0
f
/
soluteDielectric
));
else
else
preFactor
=
0.0
f
;
preFactor
=
0.0
f
;
for
(
int
blockStart
=
start
;
blockStart
<
end
;
blockStart
+=
4
)
{
while
(
true
)
{
int
numInBlock
=
min
(
4
,
end
-
blockStart
);
int
blockStart
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
4
);
if
(
blockStart
>=
numParticles
)
break
;
int
numInBlock
=
min
(
4
,
numParticles
-
blockStart
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
float
atomCharge
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
float
atomCharge
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
...
@@ -303,13 +310,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -303,13 +310,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Second loop of Born energy computation.
// Second loop of Born energy computation.
for
(
int
blockStart
=
start
;
blockStart
<
end
;
blockStart
+=
4
)
{
while
(
true
)
{
int
blockStart
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
4
);
if
(
blockStart
>=
numParticles
)
break
;
fvec4
bornForce
(
0.0
f
);
fvec4
bornForce
(
0.0
f
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
bornForce
+=
fvec4
(
&
threadBornForces
[
i
][
blockStart
]);
bornForce
+=
fvec4
(
&
threadBornForces
[
i
][
blockStart
]);
fvec4
radii
(
&
bornRadii
[
blockStart
]);
fvec4
radii
(
&
bornRadii
[
blockStart
]);
bornForce
*=
radii
*
radii
*
fvec4
(
&
obcChain
[
blockStart
]);
bornForce
*=
radii
*
radii
*
fvec4
(
&
obcChain
[
blockStart
]);
int
numInBlock
=
min
(
4
,
end
-
blockStart
);
int
numInBlock
=
min
(
4
,
numParticles
-
blockStart
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
...
@@ -351,15 +361,14 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -351,15 +361,14 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
fvec4
logRatio
=
fastLog
(
u_ij
/
l_ij
);
fvec4
logRatio
=
fastLog
(
u_ij
/
l_ij
);
fvec4
t3
=
0.125
f
*
(
1.0
f
+
scaledRadiusJ2
*
r2Inverse
)
*
(
l_ij2
-
u_ij2
)
+
0.25
f
*
logRatio
*
r2Inverse
;
fvec4
t3
=
0.125
f
*
(
1.0
f
+
scaledRadiusJ2
*
r2Inverse
)
*
(
l_ij2
-
u_ij2
)
+
0.25
f
*
logRatio
*
r2Inverse
;
fvec4
de
=
bornForce
*
t3
*
rInverse
;
fvec4
de
=
bornForce
*
t3
*
rInverse
;
de
=
blend
(
0.0
f
,
de
,
include
);
fvec4
result
[
4
]
=
{
dx
*
de
,
dy
*
de
,
dz
*
de
,
0.0
f
};
fvec4
result
[
4
]
=
{
dx
*
de
,
dy
*
de
,
dz
*
de
,
0.0
f
};
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
fvec4
atomForce
(
forces
+
4
*
atomJ
);
fvec4
atomForce
(
forces
+
4
*
atomJ
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
blockAtomForce
[
j
]
+=
result
[
j
];
blockAtomForce
[
j
]
+=
result
[
j
];
atomForce
-=
result
[
j
];
atomForce
-=
result
[
j
];
}
}
}
atomForce
.
store
(
forces
+
4
*
atomJ
);
atomForce
.
store
(
forces
+
4
*
atomJ
);
}
}
for
(
int
i
=
0
;
i
<
numInBlock
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numInBlock
;
i
++
)
{
...
@@ -385,21 +394,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
...
@@ -385,21 +394,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
fvec4
CpuGBSAOBCForce
::
fastLog
(
fvec4
x
)
{
fvec4
CpuGBSAOBCForce
::
fastLog
(
fvec4
x
)
{
// Evaluate log(x) using a lookup table for speed.
// Evaluate log(x) using a lookup table for speed.
float
y
[
4
];
if
(
any
((
x
<
TABLE_MIN
)
|
(
x
>=
TABLE_MAX
)))
fvec4
x1
=
(
x
-
0.5
f
)
*
logDXInv
;
return
fvec4
(
logf
(
x
[
0
]),
logf
(
x
[
1
]),
logf
(
x
[
2
]),
logf
(
x
[
3
]));
fvec4
x1
=
(
x
-
TABLE_MIN
)
*
logDXInv
;
ivec4
index
=
floor
(
x1
);
ivec4
index
=
floor
(
x1
);
fvec4
coeff
[
4
];
fvec4
coeff2
=
x1
-
index
;
coeff
[
1
]
=
x1
-
index
;
fvec4
coeff1
=
1.0
f
-
coeff2
;
coeff
[
0
]
=
1.0
f
-
coeff
[
1
];
fvec4
t1
(
&
logTable
[
index
[
0
]]);
coeff
[
2
]
=
coeff
[
0
]
*
coeff
[
0
]
*
coeff
[
0
]
-
coeff
[
0
];
fvec4
t2
(
&
logTable
[
index
[
1
]]);
coeff
[
3
]
=
coeff
[
1
]
*
coeff
[
1
]
*
coeff
[
1
]
-
coeff
[
1
];
fvec4
t3
(
&
logTable
[
index
[
2
]]);
transpose
(
coeff
[
0
],
coeff
[
1
],
coeff
[
2
],
coeff
[
3
]);
fvec4
t4
(
&
logTable
[
index
[
3
]]);
static
float
maxdiff
=
0.0
f
;
transpose
(
t1
,
t2
,
t3
,
t4
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
return
coeff1
*
t1
+
coeff2
*
t2
;
if
(
index
[
i
]
>=
0
&&
index
[
i
]
<
NUM_TABLE_POINTS
)
y
[
i
]
=
dot4
(
coeff
[
i
],
fvec4
(
&
logTable
[
4
*
index
[
i
]]));
else
y
[
i
]
=
logf
(
x
[
i
]);
}
return
fvec4
(
y
);
}
}
platforms/cpu/src/CpuKernels.cpp
View file @
588a4c9d
...
@@ -81,16 +81,16 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
...
@@ -81,16 +81,16 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
// Convert the positions to single precision and apply periodic boundary conditions
// Convert the positions to single precision and apply periodic boundary conditions
vector
<
float
>&
posq
=
data
.
posq
;
AlignedArray
<
float
>&
posq
=
data
.
posq
;
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
float
float
BoxSize
[
3
]
=
{
(
float
)
boxSize
[
0
],
(
float
)
boxSize
[
1
],
(
float
)
boxSize
[
2
]};
double
inv
BoxSize
[
3
]
=
{
1
/
boxSize
[
0
],
1
/
boxSize
[
1
],
1
/
boxSize
[
2
]};
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
if
(
data
.
isPeriodic
)
if
(
data
.
isPeriodic
)
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
{
for
(
int
j
=
0
;
j
<
3
;
j
++
)
{
RealOpenMM
x
=
posData
[
i
][
j
];
RealOpenMM
x
=
posData
[
i
][
j
];
double
base
=
floor
(
x
/
b
oxSize
[
j
])
*
boxSize
[
j
];
double
base
=
floor
(
x
*
invB
oxSize
[
j
])
*
boxSize
[
j
];
posq
[
4
*
i
+
j
]
=
(
float
)
(
x
-
base
);
posq
[
4
*
i
+
j
]
=
(
float
)
(
x
-
base
);
}
}
else
else
...
@@ -255,12 +255,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -255,12 +255,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
}
}
}
}
}
}
vector
<
float
>&
posq
=
data
.
posq
;
AlignedArray
<
float
>&
posq
=
data
.
posq
;
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
vector
<
RealVec
>&
forceData
=
extractForces
(
context
);
vector
<
RealVec
>&
forceData
=
extractForces
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
float
floatBoxSize
[
3
]
=
{(
float
)
boxSize
[
0
],
(
float
)
boxSize
[
1
],
(
float
)
boxSize
[
2
]};
float
floatBoxSize
[
3
]
=
{(
float
)
boxSize
[
0
],
(
float
)
boxSize
[
1
],
(
float
)
boxSize
[
2
]};
double
energy
=
ewaldSelfEnergy
;
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
)
;
bool
ewald
=
(
nonbondedMethod
==
Ewald
);
bool
ewald
=
(
nonbondedMethod
==
Ewald
);
bool
pme
=
(
nonbondedMethod
==
PME
);
bool
pme
=
(
nonbondedMethod
==
PME
);
if
(
nonbondedMethod
!=
NoCutoff
)
{
if
(
nonbondedMethod
!=
NoCutoff
)
{
...
@@ -330,7 +330,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -330,7 +330,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
PmeIO
io
(
&
posq
[
0
],
&
data
.
threadForce
[
0
][
0
],
numParticles
);
PmeIO
io
(
&
posq
[
0
],
&
data
.
threadForce
[
0
][
0
],
numParticles
);
Vec3
periodicBoxSize
(
boxSize
[
0
],
boxSize
[
1
],
boxSize
[
2
]);
Vec3
periodicBoxSize
(
boxSize
[
0
],
boxSize
[
1
],
boxSize
[
2
]);
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
beginComputation
(
io
,
periodicBoxSize
,
includeEnergy
);
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
beginComputation
(
io
,
periodicBoxSize
,
includeEnergy
);
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
finishComputation
(
io
);
nonbondedEnergy
+=
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
finishComputation
(
io
);
}
}
else
else
nonbonded
.
calculateReciprocalIxn
(
numParticles
,
&
posq
[
0
],
posData
,
particleParams
,
exclusions
,
forceData
,
includeEnergy
?
&
nonbondedEnergy
:
NULL
);
nonbonded
.
calculateReciprocalIxn
(
numParticles
,
&
posq
[
0
],
posData
,
particleParams
,
exclusions
,
forceData
,
includeEnergy
?
&
nonbondedEnergy
:
NULL
);
...
...
platforms/cpu/src/CpuNeighborList.cpp
View file @
588a4c9d
...
@@ -48,6 +48,8 @@ const int CpuNeighborList::BlockSize = 4;
...
@@ -48,6 +48,8 @@ const int CpuNeighborList::BlockSize = 4;
class
VoxelIndex
class
VoxelIndex
{
{
public:
public:
VoxelIndex
()
:
x
(
0
),
y
(
0
)
{
}
VoxelIndex
(
int
x
,
int
y
)
:
x
(
x
),
y
(
y
)
{
VoxelIndex
(
int
x
,
int
y
)
:
x
(
x
),
y
(
y
)
{
}
}
int
x
;
int
x
;
...
@@ -173,6 +175,9 @@ public:
...
@@ -173,6 +175,9 @@ public:
float
centerPos
[
4
];
float
centerPos
[
4
];
blockCenter
.
store
(
centerPos
);
blockCenter
.
store
(
centerPos
);
VoxelIndex
centerVoxelIndex
=
getVoxelIndex
(
centerPos
);
VoxelIndex
centerVoxelIndex
=
getVoxelIndex
(
centerPos
);
VoxelIndex
atomVoxelIndex
[
BlockSize
];
for
(
int
i
=
0
;
i
<
(
int
)
blockAtoms
.
size
();
i
++
)
atomVoxelIndex
[
i
]
=
getVoxelIndex
(
&
atomLocations
[
4
*
blockAtoms
[
i
]]);
int
startx
=
centerVoxelIndex
.
x
-
dIndexX
;
int
startx
=
centerVoxelIndex
.
x
-
dIndexX
;
int
starty
=
centerVoxelIndex
.
y
-
dIndexY
;
int
starty
=
centerVoxelIndex
.
y
-
dIndexY
;
int
endx
=
centerVoxelIndex
.
x
+
dIndexX
;
int
endx
=
centerVoxelIndex
.
x
+
dIndexX
;
...
@@ -194,39 +199,59 @@ public:
...
@@ -194,39 +199,59 @@ public:
voxelIndex
.
x
=
x
;
voxelIndex
.
x
=
x
;
if
(
usePeriodic
)
if
(
usePeriodic
)
voxelIndex
.
x
=
(
x
<
0
?
x
+
nx
:
(
x
>=
nx
?
x
-
nx
:
x
));
voxelIndex
.
x
=
(
x
<
0
?
x
+
nx
:
(
x
>=
nx
?
x
-
nx
:
x
));
float
dx
=
max
(
0.0
f
,
voxelSizeX
*
max
(
0
,
abs
(
centerVoxelIndex
.
x
-
x
)
-
1
)
-
blockWidth
[
0
]);
for
(
int
y
=
starty
;
y
<=
endy
;
++
y
)
{
for
(
int
y
=
starty
;
y
<=
endy
;
++
y
)
{
voxelIndex
.
y
=
y
;
voxelIndex
.
y
=
y
;
if
(
usePeriodic
)
if
(
usePeriodic
)
voxelIndex
.
y
=
(
y
<
0
?
y
+
ny
:
(
y
>=
ny
?
y
-
ny
:
y
));
voxelIndex
.
y
=
(
y
<
0
?
y
+
ny
:
(
y
>=
ny
?
y
-
ny
:
y
));
float
dy
=
max
(
0.0
f
,
voxelSizeY
*
max
(
0
,
abs
(
centerVoxelIndex
.
y
-
y
)
-
1
)
-
blockWidth
[
1
]);
// Identify the range of atoms within this bin we need to search. When using periodic boundary
// Identify the range of atoms within this bin we need to search. When using periodic boundary
// conditions, there may be two separate ranges.
// conditions, there may be two separate ranges.
float
dz
=
maxDistance
+
blockWidth
[
2
];
float
minz
=
centerPos
[
2
];
dz
=
sqrtf
(
max
(
0.0
f
,
dz
*
dz
-
dx
*
dx
-
dy
*
dy
));
float
maxz
=
centerPos
[
2
];
fvec4
offset
(
voxelSizeX
*
x
+
(
usePeriodic
?
0.0
f
:
minx
),
voxelSizeY
*
y
+
(
usePeriodic
?
0.0
f
:
miny
),
0
,
0
);
for
(
int
k
=
0
;
k
<
(
int
)
blockAtoms
.
size
();
k
++
)
{
const
float
*
atomPos
=
&
atomLocations
[
4
*
blockAtoms
[
k
]];
fvec4
posVec
(
atomPos
);
fvec4
delta1
=
offset
-
posVec
;
fvec4
delta2
=
delta1
+
fvec4
(
voxelSizeX
,
voxelSizeY
,
0
,
0
);
if
(
usePeriodic
)
{
delta1
-=
round
(
delta1
*
invBoxSize
)
*
boxSize
;
delta2
-=
round
(
delta2
*
invBoxSize
)
*
boxSize
;
}
fvec4
delta
=
min
(
abs
(
delta1
),
abs
(
delta2
));
float
dx
=
(
x
==
atomVoxelIndex
[
k
].
x
?
0.0
f
:
delta
[
0
]);
float
dy
=
(
y
==
atomVoxelIndex
[
k
].
y
?
0.0
f
:
delta
[
1
]);
float
dist2
=
maxDistanceSquared
-
dx
*
dx
-
dy
*
dy
;
if
(
dist2
>
0
)
{
float
dist
=
sqrtf
(
dist2
);
minz
=
min
(
minz
,
atomPos
[
2
]
-
dist
);
maxz
=
max
(
maxz
,
atomPos
[
2
]
+
dist
);
}
}
if
(
minz
==
maxz
)
continue
;
bool
needPeriodic
=
(
centerPos
[
0
]
-
blockWidth
[
0
]
<
maxDistance
||
centerPos
[
0
]
+
blockWidth
[
0
]
>
periodicBoxSize
[
0
]
-
maxDistance
||
bool
needPeriodic
=
(
centerPos
[
0
]
-
blockWidth
[
0
]
<
maxDistance
||
centerPos
[
0
]
+
blockWidth
[
0
]
>
periodicBoxSize
[
0
]
-
maxDistance
||
centerPos
[
1
]
-
blockWidth
[
1
]
<
maxDistance
||
centerPos
[
1
]
+
blockWidth
[
1
]
>
periodicBoxSize
[
1
]
-
maxDistance
||
centerPos
[
1
]
-
blockWidth
[
1
]
<
maxDistance
||
centerPos
[
1
]
+
blockWidth
[
1
]
>
periodicBoxSize
[
1
]
-
maxDistance
||
centerPos
[
2
]
-
dz
<
0.0
f
||
centerPos
[
2
]
+
d
z
>
periodicBoxSize
[
2
]);
minz
<
0.0
f
||
max
z
>
periodicBoxSize
[
2
]);
int
rangeStart
[
2
];
int
rangeStart
[
2
];
int
rangeEnd
[
2
];
int
rangeEnd
[
2
];
rangeStart
[
0
]
=
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
-
d
z
);
rangeStart
[
0
]
=
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
min
z
);
if
(
needPeriodic
)
{
if
(
needPeriodic
)
{
numRanges
=
2
;
numRanges
=
2
;
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
d
z
);
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
max
z
);
if
(
rangeStart
[
0
]
>
0
)
{
if
(
rangeStart
[
0
]
>
0
)
{
rangeStart
[
1
]
=
0
;
rangeStart
[
1
]
=
0
;
rangeEnd
[
1
]
=
min
(
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
d
z
-
periodicBoxSize
[
2
]),
rangeStart
[
0
]);
rangeEnd
[
1
]
=
min
(
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
max
z
-
periodicBoxSize
[
2
]),
rangeStart
[
0
]);
}
}
else
{
else
{
rangeStart
[
1
]
=
max
(
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
-
d
z
+
periodicBoxSize
[
2
]),
rangeEnd
[
0
]);
rangeStart
[
1
]
=
max
(
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
min
z
+
periodicBoxSize
[
2
]),
rangeEnd
[
0
]);
rangeEnd
[
1
]
=
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
].
size
();
rangeEnd
[
1
]
=
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
].
size
();
}
}
}
}
else
{
else
{
numRanges
=
1
;
numRanges
=
1
;
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
d
z
);
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
max
z
);
}
}
// Loop over atoms and check to see if they are neighbors of this block.
// Loop over atoms and check to see if they are neighbors of this block.
...
@@ -307,7 +332,7 @@ public:
...
@@ -307,7 +332,7 @@ public:
CpuNeighborList
::
CpuNeighborList
()
{
CpuNeighborList
::
CpuNeighborList
()
{
}
}
void
CpuNeighborList
::
computeNeighborList
(
int
numAtoms
,
const
vector
<
float
>&
atomLocations
,
const
vector
<
set
<
int
>
>&
exclusions
,
void
CpuNeighborList
::
computeNeighborList
(
int
numAtoms
,
const
AlignedArray
<
float
>&
atomLocations
,
const
vector
<
set
<
int
>
>&
exclusions
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
)
{
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
)
{
int
numBlocks
=
(
numAtoms
+
BlockSize
-
1
)
/
BlockSize
;
int
numBlocks
=
(
numAtoms
+
BlockSize
-
1
)
/
BlockSize
;
blockNeighbors
.
resize
(
numBlocks
);
blockNeighbors
.
resize
(
numBlocks
);
...
@@ -353,8 +378,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
...
@@ -353,8 +378,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
if
(
!
usePeriodic
)
if
(
!
usePeriodic
)
edgeSizeX
=
edgeSizeY
=
maxDistance
;
// TODO - adjust this as needed
edgeSizeX
=
edgeSizeY
=
maxDistance
;
// TODO - adjust this as needed
else
{
else
{
edgeSizeX
=
0.
5
f
*
periodicBoxSize
[
0
]
/
floorf
(
periodicBoxSize
[
0
]
/
maxDistance
);
edgeSizeX
=
0.
6
f
*
periodicBoxSize
[
0
]
/
floorf
(
periodicBoxSize
[
0
]
/
maxDistance
);
edgeSizeY
=
0.
5
f
*
periodicBoxSize
[
1
]
/
floorf
(
periodicBoxSize
[
1
]
/
maxDistance
);
edgeSizeY
=
0.
6
f
*
periodicBoxSize
[
1
]
/
floorf
(
periodicBoxSize
[
1
]
/
maxDistance
);
}
}
Voxels
voxels
(
edgeSizeX
,
edgeSizeY
,
minx
,
maxx
,
miny
,
maxy
,
periodicBoxSize
,
usePeriodic
);
Voxels
voxels
(
edgeSizeX
,
edgeSizeY
,
minx
,
maxx
,
miny
,
maxy
,
periodicBoxSize
,
usePeriodic
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
...
...
platforms/cpu/src/CpuNonbondedForce.cpp
View file @
588a4c9d
...
@@ -29,8 +29,8 @@
...
@@ -29,8 +29,8 @@
#include "CpuNonbondedForce.h"
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "ReferencePME.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
// In case we're using some primitive version of Visual Studio this will
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
// make sure that erf() and erfc() are defined.
...
@@ -40,7 +40,7 @@ using namespace std;
...
@@ -40,7 +40,7 @@ using namespace std;
using
namespace
OpenMM
;
using
namespace
OpenMM
;
const
float
CpuNonbondedForce
::
TWO_OVER_SQRT_PI
=
(
float
)
(
2
/
sqrt
(
PI_M
));
const
float
CpuNonbondedForce
::
TWO_OVER_SQRT_PI
=
(
float
)
(
2
/
sqrt
(
PI_M
));
const
int
CpuNonbondedForce
::
NUM_TABLE_POINTS
=
1025
;
const
int
CpuNonbondedForce
::
NUM_TABLE_POINTS
=
2048
;
class
CpuNonbondedForce
::
ComputeDirectTask
:
public
ThreadPool
::
Task
{
class
CpuNonbondedForce
::
ComputeDirectTask
:
public
ThreadPool
::
Task
{
public:
public:
...
@@ -58,10 +58,10 @@ public:
...
@@ -58,10 +58,10 @@ public:
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
CpuNonbondedForce
::
CpuNonbondedForce
()
:
cutoff
(
false
),
useSwitch
(
false
),
periodic
(
false
),
ewald
(
false
),
pme
(
false
)
{
CpuNonbondedForce
::
CpuNonbondedForce
()
:
cutoff
(
false
),
useSwitch
(
false
),
periodic
(
false
),
ewald
(
false
),
pme
(
false
)
,
tableIsValid
(
false
)
{
}
}
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
Set the force to use a cutoff.
...
@@ -71,8 +71,9 @@ CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), period
...
@@ -71,8 +71,9 @@ CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), period
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
CpuNonbondedForce
::
setUseCutoff
(
float
distance
,
const
CpuNeighborList
&
neighbors
,
float
solventDielectric
)
{
void
CpuNonbondedForce
::
setUseCutoff
(
float
distance
,
const
CpuNeighborList
&
neighbors
,
float
solventDielectric
)
{
if
(
distance
!=
cutoffDistance
)
tableIsValid
=
false
;
cutoff
=
true
;
cutoff
=
true
;
cutoffDistance
=
distance
;
cutoffDistance
=
distance
;
neighborList
=
&
neighbors
;
neighborList
=
&
neighbors
;
...
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
...
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
CpuNonbondedForce
::
setUseEwald
(
float
alpha
,
int
kmaxx
,
int
kmaxy
,
int
kmaxz
)
{
void
CpuNonbondedForce
::
setUseEwald
(
float
alpha
,
int
kmaxx
,
int
kmaxy
,
int
kmaxz
)
{
if
(
alpha
!=
alphaEwald
)
tableIsValid
=
false
;
alphaEwald
=
alpha
;
alphaEwald
=
alpha
;
numRx
=
kmaxx
;
numRx
=
kmaxx
;
numRy
=
kmaxy
;
numRy
=
kmaxy
;
...
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
...
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
CpuNonbondedForce
::
setUsePME
(
float
alpha
,
int
meshSize
[
3
])
{
void
CpuNonbondedForce
::
setUsePME
(
float
alpha
,
int
meshSize
[
3
])
{
if
(
alpha
!=
alphaEwald
)
tableIsValid
=
false
;
alphaEwald
=
alpha
;
alphaEwald
=
alpha
;
meshDim
[
0
]
=
meshSize
[
0
];
meshDim
[
0
]
=
meshSize
[
0
];
meshDim
[
1
]
=
meshSize
[
1
];
meshDim
[
1
]
=
meshSize
[
1
];
...
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
...
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void
CpuNonbondedForce
::
tabulateEwaldScaleFactor
()
{
void
CpuNonbondedForce
::
tabulateEwaldScaleFactor
()
{
ewaldDX
=
cutoffDistance
/
(
NUM_TABLE_POINTS
-
2
);
if
(
tableIsValid
)
return
;
tableIsValid
=
true
;
ewaldDX
=
cutoffDistance
/
NUM_TABLE_POINTS
;
ewaldDXInv
=
1.0
f
/
ewaldDX
;
ewaldDXInv
=
1.0
f
/
ewaldDX
;
vector
<
double
>
x
(
NUM_TABLE_POINTS
+
1
);
ewaldScaleTable
.
resize
(
NUM_TABLE_POINTS
+
4
);
vector
<
double
>
y
(
NUM_TABLE_POINTS
+
1
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
4
;
i
++
)
{
vector
<
double
>
deriv
;
double
r
=
i
*
ewaldDX
;
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
1
;
i
++
)
{
double
r
=
i
*
cutoffDistance
/
(
NUM_TABLE_POINTS
-
2
);
double
alphaR
=
alphaEwald
*
r
;
double
alphaR
=
alphaEwald
*
r
;
x
[
i
]
=
r
;
ewaldScaleTable
[
i
]
=
erfc
(
alphaR
)
+
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
);
y
[
i
]
=
erfc
(
alphaR
)
+
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
);
}
SplineFitter
::
createNaturalSpline
(
x
,
y
,
deriv
);
ewaldScaleTable
.
resize
(
4
*
NUM_TABLE_POINTS
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
;
i
++
)
{
ewaldScaleTable
[
4
*
i
]
=
(
float
)
y
[
i
];
ewaldScaleTable
[
4
*
i
+
1
]
=
(
float
)
y
[
i
+
1
];
ewaldScaleTable
[
4
*
i
+
2
]
=
(
float
)
(
deriv
[
i
]
*
ewaldDX
*
ewaldDX
/
6
);
ewaldScaleTable
[
4
*
i
+
3
]
=
(
float
)
(
deriv
[
i
+
1
]
*
ewaldDX
*
ewaldDX
/
6
);
}
}
}
}
...
@@ -291,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
...
@@ -291,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
void
CpuNonbondedForce
::
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
vector
<
RealVec
>&
atomCoordinates
,
const
vector
<
pair
<
float
,
float
>
>&
atomParameters
,
void
CpuNonbondedForce
::
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
vector
<
RealVec
>&
atomCoordinates
,
const
vector
<
pair
<
float
,
float
>
>&
atomParameters
,
const
vector
<
set
<
int
>
>&
exclusions
,
vector
<
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
const
vector
<
set
<
int
>
>&
exclusions
,
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
// Record the parameters for the threads.
// Record the parameters for the threads.
this
->
numberOfAtoms
=
numberOfAtoms
;
this
->
numberOfAtoms
=
numberOfAtoms
;
...
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
...
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
this
->
threadForce
=
&
threadForce
;
this
->
threadForce
=
&
threadForce
;
includeEnergy
=
(
totalEnergy
!=
NULL
);
includeEnergy
=
(
totalEnergy
!=
NULL
);
threadEnergy
.
resize
(
threads
.
getNumThreads
());
threadEnergy
.
resize
(
threads
.
getNumThreads
());
gmx_atomic_t
counter
;
gmx_atomic_set
(
&
counter
,
0
);
this
->
atomicCounter
=
&
counter
;
// Signal the threads to start running and wait for them to finish.
// Signal the threads to start running and wait for them to finish.
...
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
...
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
if
(
ewald
||
pme
)
{
if
(
ewald
||
pme
)
{
// Compute the interactions from the neighbor list.
// Compute the interactions from the neighbor list.
for
(
int
i
=
threadIndex
;
i
<
neighborList
->
getNumBlocks
();
i
+=
numThreads
)
while
(
true
)
{
calculateBlockEwaldIxn
(
i
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
int
nextBlock
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
nextBlock
>=
neighborList
->
getNumBlocks
())
break
;
calculateBlockEwaldIxn
(
nextBlock
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
}
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
...
@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
...
@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
else
if
(
cutoff
)
{
else
if
(
cutoff
)
{
// Compute the interactions from the neighbor list.
// Compute the interactions from the neighbor list.
for
(
int
i
=
threadIndex
;
i
<
neighborList
->
getNumBlocks
();
i
+=
numThreads
)
while
(
true
)
{
calculateBlockIxn
(
i
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
int
nextBlock
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
nextBlock
>=
neighborList
->
getNumBlocks
())
break
;
calculateBlockIxn
(
nextBlock
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
}
}
}
else
{
else
{
// Loop over all atom pairs
// Loop over all atom pairs
for
(
int
i
=
threadIndex
;
i
<
numberOfAtoms
;
i
+=
numThreads
){
while
(
true
)
{
int
i
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
i
>=
numberOfAtoms
)
break
;
for
(
int
j
=
i
+
1
;
j
<
numberOfAtoms
;
j
++
)
for
(
int
j
=
i
+
1
;
j
<
numberOfAtoms
;
j
++
)
if
(
exclusions
[
j
].
find
(
i
)
==
exclusions
[
j
].
end
())
if
(
exclusions
[
j
].
find
(
i
)
==
exclusions
[
j
].
end
())
calculateOneIxn
(
i
,
j
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
calculateOneIxn
(
i
,
j
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
...
@@ -461,12 +472,12 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
...
@@ -461,12 +472,12 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
break
;
break
;
}
}
}
}
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
bool
include
[
4
];
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
// Load the next neighbor.
...
@@ -475,43 +486,50 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
...
@@ -475,43 +486,50 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
bool
any
=
false
;
fvec4
dx
,
dy
,
dz
,
r2
;
fvec4
dx
,
dy
,
dz
,
r2
;
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
ivec4
include
;
include
[
j
]
=
(((
exclusions
[
i
]
>>
j
)
&
1
)
==
0
&&
(
!
cutoff
||
r2
[
j
]
<
cutoffDistance
*
cutoffDistance
));
char
excl
=
exclusions
[
i
];
any
|=
include
[
j
];
if
(
excl
==
0
)
}
include
=
-
1
;
if
(
!
any
)
else
include
=
ivec4
(
excl
&
1
?
0
:
-
1
,
excl
&
2
?
0
:
-
1
,
excl
&
4
?
0
:
-
1
,
excl
&
8
?
0
:
-
1
);
include
=
include
&
(
r2
<
cutoffDistance
*
cutoffDistance
);
if
(
!
any
(
include
))
continue
;
// No interactions to compute.
continue
;
// No interactions to compute.
// Compute the interactions.
// Compute the interactions.
fvec4
r
=
sqrt
(
r2
);
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
switchValue
(
1.0
f
),
switchDeriv
(
0.0
f
);
fvec4
energy
,
dEdR
;
if
(
useSwitch
)
{
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
/
(
cutoffDistance
-
switchingDistance
));
if
(
atomEpsilon
!=
0.0
f
)
{
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
/
(
cutoffDistance
-
switchingDistance
);
}
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
fvec4
sig2
=
inverseR
*
sig
;
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
eps
=
blockAtomEpsilon
*
atomParameters
[
atom
].
second
;
fvec4
epsSig6
=
blockAtomEpsilon
*
atomEpsilon
*
sig6
;
fvec4
dEdR
=
switchValue
*
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
if
(
useSwitch
)
{
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
dEdR
=
switchValue
*
dEdR
-
energy
*
switchDeriv
*
r
;
energy
*=
switchValue
;
}
}
else
{
energy
=
0.0
f
;
dEdR
=
0.0
f
;
}
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
if
(
cutoff
)
if
(
cutoff
)
dEdR
+=
chargeProd
*
(
inverseR
-
2.0
f
*
krf
*
r2
);
dEdR
+=
chargeProd
*
(
inverseR
-
2.0
f
*
krf
*
r2
);
else
else
dEdR
+=
chargeProd
*
inverseR
;
dEdR
+=
chargeProd
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
fvec4
energy
=
eps
*
(
sig6
-
1.0
f
)
*
sig6
;
if
(
useSwitch
)
{
dEdR
-=
energy
*
switchDeriv
*
inverseR
;
energy
*=
switchValue
;
}
// Accumulate energies.
// Accumulate energies.
...
@@ -520,22 +538,20 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
...
@@ -520,22 +538,20 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
energy
+=
chargeProd
*
(
inverseR
+
krf
*
r2
-
crf
);
energy
+=
chargeProd
*
(
inverseR
+
krf
*
r2
-
crf
);
else
else
energy
+=
chargeProd
*
inverseR
;
energy
+=
chargeProd
*
inverseR
;
for
(
int
j
=
0
;
j
<
4
;
j
++
)
energy
=
blend
(
0.0
f
,
energy
,
include
);
if
(
include
[
j
])
*
totalEnergy
+=
dot4
(
energy
,
1.0
f
);
*
totalEnergy
+=
energy
[
j
];
}
}
// Accumulate forces.
// Accumulate forces.
dEdR
=
blend
(
0.0
f
,
dEdR
,
include
);
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
fvec4
atomForce
(
forces
+
4
*
atom
);
fvec4
atomForce
(
forces
+
4
*
atom
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
blockAtomForce
[
j
]
+=
result
[
j
];
blockAtomForce
[
j
]
+=
result
[
j
];
atomForce
-=
result
[
j
];
atomForce
-=
result
[
j
];
}
}
}
atomForce
.
store
(
forces
+
4
*
atom
);
atomForce
.
store
(
forces
+
4
*
atom
);
}
}
...
@@ -569,12 +585,12 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
...
@@ -569,12 +585,12 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
needPeriodic
=
true
;
needPeriodic
=
true
;
break
;
break
;
}
}
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
bool
include
[
4
];
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
// Load the next neighbor.
...
@@ -583,61 +599,66 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
...
@@ -583,61 +599,66 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
bool
any
=
false
;
fvec4
dx
,
dy
,
dz
,
r2
;
fvec4
dx
,
dy
,
dz
,
r2
;
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
ivec4
include
;
include
[
j
]
=
(((
exclusions
[
i
]
>>
j
)
&
1
)
==
0
&&
r2
[
j
]
<
cutoffDistance
*
cutoffDistance
);
char
excl
=
exclusions
[
i
];
any
|=
include
[
j
];
if
(
excl
==
0
)
}
include
=
-
1
;
if
(
!
any
)
else
include
=
ivec4
(
excl
&
1
?
0
:
-
1
,
excl
&
2
?
0
:
-
1
,
excl
&
4
?
0
:
-
1
,
excl
&
8
?
0
:
-
1
);
include
=
include
&
(
r2
<
cutoffDistance
*
cutoffDistance
);
if
(
!
any
(
include
))
continue
;
// No interactions to compute.
continue
;
// No interactions to compute.
// Compute the interactions.
// Compute the interactions.
fvec4
r
=
sqrt
(
r2
);
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
switchValue
(
1.0
f
),
switchDeriv
(
0.0
f
);
fvec4
energy
,
dEdR
;
if
(
useSwitch
)
{
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
/
(
cutoffDistance
-
switchingDistance
));
if
(
atomEpsilon
!=
0.0
f
)
{
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
/
(
cutoffDistance
-
switchingDistance
);
}
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
fvec4
dEdR
=
chargeProd
*
inverseR
*
ewaldScaleFunction
(
r
);
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
fvec4
sig2
=
inverseR
*
sig
;
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
eps
=
blockAtomEpsilon
*
atomParameters
[
atom
].
second
;
fvec4
epsSig6
=
blockAtomEpsilon
*
atomEpsilon
*
sig6
;
dEdR
+=
switchValue
*
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
dEdR
*=
inverseR
*
inverseR
;
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
fvec4
energy
=
eps
*
(
sig6
-
1.0
f
)
*
sig6
;
if
(
useSwitch
)
{
if
(
useSwitch
)
{
dEdR
-=
energy
*
switchDeriv
*
inverseR
;
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
dEdR
=
switchValue
*
dEdR
-
energy
*
switchDeriv
*
r
;
energy
*=
switchValue
;
energy
*=
switchValue
;
}
}
}
else
{
energy
=
0.0
f
;
dEdR
=
0.0
f
;
}
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
dEdR
+=
chargeProd
*
inverseR
*
ewaldScaleFunction
(
r
);
dEdR
*=
inverseR
*
inverseR
;
// Accumulate energies.
// Accumulate energies.
if
(
totalEnergy
)
{
if
(
totalEnergy
)
{
energy
+=
chargeProd
*
inverseR
*
erfcApprox
(
alphaEwald
*
r
);
energy
+=
chargeProd
*
inverseR
*
erfcApprox
(
alphaEwald
*
r
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
energy
=
blend
(
0.0
f
,
energy
,
include
);
if
(
include
[
j
])
*
totalEnergy
+=
dot4
(
energy
,
1.0
f
);
*
totalEnergy
+=
energy
[
j
];
}
}
// Accumulate forces.
// Accumulate forces.
dEdR
=
blend
(
0.0
f
,
dEdR
,
include
);
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
fvec4
atomForce
(
forces
+
4
*
atom
);
fvec4
atomForce
(
forces
+
4
*
atom
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
blockAtomForce
[
j
]
+=
result
[
j
];
blockAtomForce
[
j
]
+=
result
[
j
];
atomForce
-=
result
[
j
];
atomForce
-=
result
[
j
];
}
}
}
atomForce
.
store
(
forces
+
4
*
atom
);
atomForce
.
store
(
forces
+
4
*
atom
);
}
}
...
@@ -683,18 +704,14 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
...
@@ -683,18 +704,14 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
fvec4
CpuNonbondedForce
::
ewaldScaleFunction
(
fvec4
x
)
{
fvec4
CpuNonbondedForce
::
ewaldScaleFunction
(
fvec4
x
)
{
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
float
y
[
4
];
fvec4
x1
=
x
*
ewaldDXInv
;
fvec4
x1
=
x
*
ewaldDXInv
;
ivec4
index
=
floor
(
x1
);
ivec4
index
=
min
(
floor
(
x1
),
NUM_TABLE_POINTS
);
fvec4
coeff
[
4
];
fvec4
coeff2
=
x1
-
index
;
coeff
[
1
]
=
x1
-
index
;
fvec4
coeff1
=
1.0
f
-
coeff2
;
coeff
[
0
]
=
1.0
f
-
coeff
[
1
];
fvec4
t1
(
&
ewaldScaleTable
[
index
[
0
]]);
coeff
[
2
]
=
coeff
[
0
]
*
coeff
[
0
]
*
coeff
[
0
]
-
coeff
[
0
];
fvec4
t2
(
&
ewaldScaleTable
[
index
[
1
]]);
coeff
[
3
]
=
coeff
[
1
]
*
coeff
[
1
]
*
coeff
[
1
]
-
coeff
[
1
];
fvec4
t3
(
&
ewaldScaleTable
[
index
[
2
]]);
transpose
(
coeff
[
0
],
coeff
[
1
],
coeff
[
2
],
coeff
[
3
]);
fvec4
t4
(
&
ewaldScaleTable
[
index
[
3
]]);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
transpose
(
t1
,
t2
,
t3
,
t4
);
if
(
index
[
i
]
<
NUM_TABLE_POINTS
)
return
coeff1
*
t1
+
coeff2
*
t2
;
y
[
i
]
=
dot4
(
coeff
[
i
],
fvec4
(
&
ewaldScaleTable
[
4
*
index
[
i
]]));
}
return
fvec4
(
y
);
}
}
platforms/cpu/src/CpuPlatform.cpp
View file @
588a4c9d
...
@@ -32,6 +32,8 @@
...
@@ -32,6 +32,8 @@
#include "CpuPlatform.h"
#include "CpuPlatform.h"
#include "CpuKernelFactory.h"
#include "CpuKernelFactory.h"
#include "CpuKernels.h"
#include "CpuKernels.h"
#include "CpuSETTLE.h"
#include "ReferenceConstraints.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/hardware.h"
using
namespace
OpenMM
;
using
namespace
OpenMM
;
...
@@ -77,6 +79,12 @@ void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>
...
@@ -77,6 +79,12 @@ void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>
ReferencePlatform
::
contextCreated
(
context
,
properties
);
ReferencePlatform
::
contextCreated
(
context
,
properties
);
PlatformData
*
data
=
new
PlatformData
(
context
.
getSystem
().
getNumParticles
());
PlatformData
*
data
=
new
PlatformData
(
context
.
getSystem
().
getNumParticles
());
contextData
[
&
context
]
=
data
;
contextData
[
&
context
]
=
data
;
ReferenceConstraints
&
constraints
=
*
(
ReferenceConstraints
*
)
reinterpret_cast
<
ReferencePlatform
::
PlatformData
*>
(
context
.
getPlatformData
())
->
constraints
;
if
(
constraints
.
settle
!=
NULL
)
{
CpuSETTLE
*
parallelSettle
=
new
CpuSETTLE
(
context
.
getSystem
(),
*
(
ReferenceSETTLEAlgorithm
*
)
constraints
.
settle
,
data
->
threads
);
delete
constraints
.
settle
;
constraints
.
settle
=
parallelSettle
;
}
}
}
void
CpuPlatform
::
contextDestroyed
(
ContextImpl
&
context
)
const
{
void
CpuPlatform
::
contextDestroyed
(
ContextImpl
&
context
)
const
{
...
@@ -89,8 +97,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
...
@@ -89,8 +97,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
return
*
contextData
[
&
context
];
return
*
contextData
[
&
context
];
}
}
CpuPlatform
::
PlatformData
::
PlatformData
(
int
numParticles
)
{
CpuPlatform
::
PlatformData
::
PlatformData
(
int
numParticles
)
:
posq
(
4
*
numParticles
)
{
posq
.
resize
(
4
*
numParticles
);
int
numThreads
=
threads
.
getNumThreads
();
int
numThreads
=
threads
.
getNumThreads
();
threadForce
.
resize
(
numThreads
);
threadForce
.
resize
(
numThreads
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
...
...
platforms/cpu/src/CpuSETTLE.cpp
0 → 100644
View file @
588a4c9d
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuSETTLE.h"
using
namespace
OpenMM
;
using
namespace
std
;
class
CpuSETTLE
::
ApplyToPositionsTask
:
public
ThreadPool
::
Task
{
public:
ApplyToPositionsTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
atomCoordinatesP
(
atomCoordinatesP
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
threadSettle
[
threadIndex
]
->
apply
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
);
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
;
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
};
class
CpuSETTLE
::
ApplyToVelocitiesTask
:
public
ThreadPool
::
Task
{
public:
ApplyToVelocitiesTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
velocities
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
velocities
(
velocities
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
threadSettle
[
threadIndex
]
->
applyToVelocities
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
);
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
vector
<
OpenMM
::
RealVec
>&
velocities
;
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
};
CpuSETTLE
::
CpuSETTLE
(
const
System
&
system
,
const
ReferenceSETTLEAlgorithm
&
settle
,
ThreadPool
&
threads
)
:
threads
(
threads
)
{
int
numThreads
=
threads
.
getNumThreads
();
int
numClusters
=
settle
.
getNumClusters
();
vector
<
RealOpenMM
>
mass
(
system
.
getNumParticles
());
for
(
int
i
=
0
;
i
<
system
.
getNumParticles
();
i
++
)
mass
[
i
]
=
system
.
getParticleMass
(
i
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
{
int
start
=
i
*
numClusters
/
numThreads
;
int
end
=
(
i
+
1
)
*
numClusters
/
numThreads
;
if
(
start
!=
end
)
{
int
numThreadClusters
=
end
-
start
;
vector
<
int
>
atom1
(
numThreadClusters
),
atom2
(
numThreadClusters
),
atom3
(
numThreadClusters
);
vector
<
RealOpenMM
>
distance1
(
numThreadClusters
),
distance2
(
numThreadClusters
);
for
(
int
j
=
0
;
j
<
numThreadClusters
;
j
++
)
settle
.
getClusterParameters
(
start
+
j
,
atom1
[
j
],
atom2
[
j
],
atom3
[
j
],
distance1
[
j
],
distance2
[
j
]);
threadSettle
.
push_back
(
new
ReferenceSETTLEAlgorithm
(
atom1
,
atom2
,
atom3
,
distance1
,
distance2
,
mass
));
}
}
}
CpuSETTLE
::~
CpuSETTLE
()
{
for
(
int
i
=
0
;
i
<
(
int
)
threadSettle
.
size
();
i
++
)
delete
threadSettle
[
i
];
}
void
CpuSETTLE
::
apply
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
)
{
ApplyToPositionsTask
task
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
,
threadSettle
);
threads
.
execute
(
task
);
threads
.
waitForThreads
();
}
void
CpuSETTLE
::
applyToVelocities
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
velocities
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
)
{
ApplyToVelocitiesTask
task
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
,
threadSettle
);
threads
.
execute
(
task
);
threads
.
waitForThreads
();
}
platforms/cpu/src/gmx_atomic.h
0 → 100644
View file @
588a4c9d
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
*
* Copyright (c) 2004-2008, Erik Lindahl <lindahl@cbr.su.se>
*
* Unfortunately, some of the constructs in this file are _very_ sensitive
* to compiler optimizations and architecture changes. If you find any such
* errors, please send a message to lindahl@cbr.su.se to help us fix the
* upstream version too.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* And Hey:
* Gnomes, ROck Monsters And Chili Sauce
*/
#ifndef _GMX_ATOMIC_H_
#define _GMX_ATOMIC_H_
/*! \file gmx_atomic.h
*
* @brief Atomic operations for fast SMP synchronization
*
* This file defines atomic integer operations and spinlocks for
* fast synchronization in performance-critical regions of gromacs.
*
* In general, the best option is to use functions without explicit
* locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
*
* Not all architecture support atomic operations though inline assembly,
* and even if they do it might not be implemented here. In that case
* we use a fallback mutex implementation, so you can always count on
* the function interfaces working in Gromacs.
*
* Don't use spinlocks in non-performance-critical regions like file I/O.
* Since they always spin busy they would waste CPU cycles instead of
* properly yielding to a computation thread while waiting for the disk.
*
* Finally, note that all our spinlock operations are defined to return
* 0 if initialization or locking completes successfully.
* This is the opposite of some other implementations, but the same standard
* as used for pthread mutexes. So, if e.g. are trying to lock a spinlock,
* you will have gotten the lock if the return value is 0.
*
* gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
* and 0 if it is available, though...
*/
#include <stdio.h>
#include <pthread.h>
#ifdef __cplusplus
extern
"C"
{
#endif
#if 0
} /* Avoids screwing up auto-indentation */
#endif
#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
(defined(i386) || defined(__x86_64__)) ) \
|| defined (DOXYGEN) )
/* This code is executed for x86 and x86-64, with these compilers:
* GNU
* Intel
* Pathscale
* All these support GCC-style inline assembly.
* We also use this section for the documentation.
*/
/*! \brief Memory barrier operation
*
* Modern CPUs rely heavily on out-of-order execution, and one common feature
* is that load/stores might be reordered. Also, when using inline assembly
* the compiler might already have loaded the variable we are changing into
* a register, so any update to memory won't be visible.
*
* This command creates a memory barrier, i.e. all memory results before
* it in the code should be visible to all memory operations after it - the
* CPU cannot propagate load/stores across it.
*/
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
#define __builtin_constant_p(i) (1)
#endif
/*! \brief Gromacs atomic operations datatype
*
* Portable synchronization primitives like mutexes are effective for
* many purposes, but usually not very high performance.
* One of the problem is that you have the overhead of a function call,
* and another is that Mutexes often have extra overhead to make the
* scheduling fair. Finally, if performance is important we don't want
* to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
* CPU usage until the resources is available (e.g. increment a counter).
*
* These things can often be implemented with inline-assembly or other
* system-dependent functions, and we provide such functionality for the
* most common platforms. For portability we also have a fallback
* implementation using a mutex for locking.
*
* Performance-wise, the fastest solution is always to avoid locking
* completely (obvious, but remember it!). If you cannot do that, the
* next best thing is to use atomic operations that e.g. increment a
* counter without explicit locking. Spinlocks are useful to lock an
* entire region, but leads to more overhead and can be difficult to
* debug - it is up to you to make sure that only the thread owning the
* lock unlocks it!
*
* You should normally NOT use atomic operations for things like
* I/O threads. These should yield to other threads while waiting for
* the disk instead of spinning at 100% CPU usage.
*
* It is imperative that you use the provided routines for reading
* and writing, since some implementations require memory barriers before
* the CPU or memory sees an updated result. The structure contents is
* only visible here so it can be inlined for performance - it might
* change without further notice.
*
* \note No initialization is required for atomic variables.
*
* Currently, we have (real) atomic operations for:
*
* - x86 or x86_64, using GNU compilers
* - x86 or x86_64, using Intel compilers
* - x86 or x86_64, using Pathscale compilers
* - Itanium, using GNU compilers
* - Itanium, using Intel compilers
* - Itanium, using HP compilers
* - PowerPC, using GNU compilers
* - PowerPC, using IBM AIX compilers
* - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
*/
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
/*! \brief Gromacs spinlock
*
* Spinlocks provide a faster synchronization than mutexes,
* although they consume CPU-cycles while waiting. They are implemented
* with atomic operations and inline assembly whenever possible, and
* otherwise we use a fallback implementation where a spinlock is identical
* to a mutex (this is one of the reasons why you have to initialize them).
*
* There are no guarantees whatsoever about fair scheduling or
* debugging if you make a mistake and unlock a variable somebody
* else has locked - performance is the primary goal of spinlocks.
*
*/
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
/*! \brief Spinlock static initializer
*
* This is used for static spinlock initialization, and has the same
* properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
* This is only for inlining in the gmx_thread.h header file. Whether
* it is 0, 1, or something else when unlocked depends on the platform.
* Don't assume anything about it. It might even be a mutex when using the
* fallback implementation!
*/
#define GMX_SPINLOCK_INITIALIZER { 1 }
/*! \brief Return value of an atomic integer
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a Atomic variable to read
* \return Integer value of the atomic variable
*/
#define gmx_atomic_read(a) ((a)->value)
/*! \brief Write value to an atomic integer
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a Atomic variable
* \param i Integer to set the atomic variable to.
*/
#define gmx_atomic_set(a,i) (((a)->value) = (i))
/*! \brief Add integer to atomic variable
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a atomic datatype to modify
* \param i integer to increment with. Use i<0 to subtract atomically.
*
* \return The new value (after summation).
*/
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
int
__i
;
__i
=
i
;
__asm__
__volatile__
(
"lock ; xaddl %0, %1;"
:
"=r"
(
i
)
:
"m"
(
a
->
value
),
"0"
(
i
));
return
i
+
__i
;
}
/*! \brief Add to variable, return the old value.
*
* This operation is quite useful for synchronization counters.
* By performing a fetchadd with N, a thread can e.g. reserve a chunk
* with the next N iterations, and the return value is the index
* of the first element to treat.
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a atomic datatype to modify
* \param i integer to increment with. Use i<0 to subtract atomically.
*
* \return The value of the atomic variable before addition.
*/
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
int
__i
;
__i
=
i
;
__asm__
__volatile__
(
"lock ; xaddl %0, %1;"
:
"=r"
(
i
)
:
"m"
(
a
->
value
),
"0"
(
i
));
return
i
;
}
/*! \brief Atomic compare-exchange operation
*
* The \a old value is compared with the memory value in the atomic datatype.
* If the are identical, the atomic type is updated to the new value,
* and otherwise left unchanged.
*
* This is a very useful synchronization primitive: You can start by reading
* a value (without locking anything), perform some calculations, and then
* atomically try to update it in memory unless it has changed. If it has
* changed you will get an error return code - reread the new value
* an repeat the calculations in that case.
*
* \param a Atomic datatype ('memory' value)
* \param oldval Integer value read from the atomic type at an earlier point
* \param newval New value to write to the atomic type if it currently is
* identical to the old value.
*
* \return The value of the atomic memory variable in memory when this
* instruction was executed. This, if the operation succeeded the
* return value was identical to the \a old parameter, and if not
* it returns the updated value in memory so you can repeat your
* operations on it.
*
* \note The exchange occured if the return value is identical to \a old.
*/
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
volatile
unsigned
long
prev
;
__asm__
__volatile__
(
"lock ; cmpxchgl %1,%2"
:
"=a"
(
prev
)
:
"q"
(
newval
),
"m"
(
a
->
value
),
"0"
(
oldval
)
:
"memory"
);
return
prev
;
}
/*! \brief Initialize spinlock
*
* In theory you can call this from multiple threads, but remember
* that we don't check for errors. If the first thread proceeded to
* lock the spinlock after initialization, the second will happily
* overwrite the contents and unlock it without warning you.
*
* \param x Gromacs spinlock pointer.
*/
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
1
;
}
/*! \brief Acquire spinlock
*
* This routine blocks until the spinlock is available, and
* the locks it again before returning.
*
* \param x Gromacs spinlock pointer
*/
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
__asm__
__volatile__
(
"
\n
1:
\t
"
"lock ; decb %0
\n\t
"
"jns 3f
\n
"
"2:
\t
"
"rep;nop
\n\t
"
"cmpb $0,%0
\n\t
"
"jle 2b
\n\t
"
"jmp 1b
\n
"
"3:
\n\t
"
:
"=m"
(
x
->
lock
)
:
:
"memory"
);
}
/*! \brief Attempt to acquire spinlock
*
* This routine acquires the spinlock if possible, but if
* already locked it return an error code immediately.
*
* \param x Gromacs spinlock pointer
*
* \return 0 if the mutex was available so we could lock it,
* otherwise a non-zero integer (1) if the lock is busy.
*/
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
char
old_value
;
__asm__
__volatile__
(
"xchgb %b0,%1"
:
"=q"
(
old_value
),
"=m"
(
x
->
lock
)
:
"0"
(
0
)
:
"memory"
);
return
(
old_value
<=
0
);
}
/*! \brief Release spinlock
*
* \param x Gromacs spinlock pointer
*
* Unlocks the spinlock, regardless if which thread locked it.
*/
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
char
old_value
=
1
;
__asm__
__volatile__
(
"xchgb %b0, %1"
:
"=q"
(
old_value
),
"=m"
(
x
->
lock
)
:
"0"
(
old_value
)
:
"memory"
);
}
/*! \brief Check if spinlock is locked
*
* This routine returns immediately with the lock status.
*
* \param x Gromacs spinlock pointer
*
* \return 1 if the spinlock is locked, 0 otherwise.
*/
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
*
(
volatile
signed
char
*
)(
&
(
x
)
->
lock
)
<=
0
);
}
/*! \brief Wait for a spinlock to become available
*
* This routine blocks until the spinlock is unlocked,
* but in contrast to gmx_spinlock_lock() it returns without
* trying to lock the spinlock.
*
* \param x Gromacs spinlock pointer
*/
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))
/* PowerPC using proper GCC inline assembly.
* Recent versions of xlC (>=7.0) _partially_ support this, but since it is
* not 100% compatible we provide a separate implementation for xlC in
* the next section.
*/
/* Compiler-dependent stuff: GCC memory barrier */
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
)
:
"cc"
,
"memory"
);
return
t
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
)
:
"cc"
,
"memory"
);
return
(
t
-
i
);
}
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
prev
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
cmpw 0,%0,%3
\n
"
"
\t
bne 2f
\n
"
"
\t
stwcx. %4,0,%2
\n
"
"bne- 1b
\n
"
"
\t
sync
\n
"
"2:
\n
"
:
"=&r"
(
prev
),
"=m"
(
a
->
value
)
:
"r"
(
&
a
->
value
),
"r"
(
oldval
),
"r"
(
newval
),
"m"
(
a
->
value
)
:
"cc"
,
"memory"
);
return
prev
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
tmp
;
__asm__
__volatile__
(
"
\t
b 1f
\n
"
"2: lwzx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne+ 2b
\n
"
"1: lwarx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne- 2b
\n
"
"
\t
stwcx. %2,0,%1
\n
"
"bne- 2b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
tmp
)
:
"r"
(
&
x
->
lock
),
"r"
(
1
)
:
"cr0"
,
"memory"
);
}
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
old
,
t
;
unsigned
int
mask
=
1
;
volatile
unsigned
int
*
p
=
&
x
->
lock
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%4
\n
"
"
\t
or %1,%0,%3
\n
"
"
\t
stwcx. %1,0,%4
\n
"
"
\t
bne 1b
\n
"
"
\t
sync
\n
"
:
"=&r"
(
old
),
"=&r"
(
t
),
"=m"
(
*
p
)
:
"r"
(
mask
),
"r"
(
p
),
"m"
(
*
p
)
:
"cc"
,
"memory"
);
return
((
old
&
mask
)
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
__asm__
__volatile__
(
"
\t
eieio
\n
"
:
:
:
"memory"
);
x
->
lock
=
0
;
}
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM)) && \
(defined(__powerpc__) || defined(__ppc__)))
/* PowerPC using xlC inline assembly.
* Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
* if you use the option -qasm=gcc but we have had to hack things a bit, in
* particular when it comes to clobbered variables. Since this implementation
* _could_ be buggy, we have separated it from the known-to-be-working gcc
* one above.
*/
/* memory barrier - no idea how to create one with xlc! */
#define gmx_atomic_memory_barrier()
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
)
);
return
t
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
));
return
(
t
-
i
);
}
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
prev
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
cmpw 0,%0,%3
\n
"
"
\t
bne 2f
\n
"
"
\t
stwcx. %4,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
sync
\n
"
"2:
\n
"
:
"=&r"
(
prev
),
"=m"
(
a
->
value
)
:
"r"
(
&
a
->
value
),
"r"
(
oldval
),
"r"
(
newval
),
"m"
(
a
->
value
));
return
prev
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
tmp
;
__asm__
__volatile__
(
"
\t
b 1f
\n
"
"2: lwzx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne+ 2b
\n
"
"1: lwarx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne- 2b
\n
"
"
\t
stwcx. %2,0,%1
\n
"
"
\t
bne- 2b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
tmp
)
:
"r"
(
&
x
->
lock
),
"r"
(
1
));
}
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
old
,
t
;
unsigned
int
mask
=
1
;
volatile
unsigned
int
*
p
=
&
x
->
lock
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%4
\n
"
"
\t
or %1,%0,%3
\n
"
"
\t
stwcx. %1,0,%4
\n
"
"
\t
bne 1b
\n
"
"
\t
sync
\n
"
:
"=&r"
(
old
),
"=&r"
(
t
),
"=m"
(
*
p
)
:
"r"
(
mask
),
"r"
(
p
),
"m"
(
*
p
));
return
((
old
&
mask
)
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
__asm__
__volatile__
(
"
\t
eieio
\n
"
);
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))
/* ia64 with GCC or Intel compilers. Since we need to define everything through
* cmpxchg and fetchadd on ia64, we merge the different compilers and only provide
* different implementations for that single function.
* Documentation? Check the gcc/x86 section.
*/
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
/* Compiler thingies */
#ifdef __INTEL_COMPILER
void
__memory_barrier
(
void
);
int
_InterlockedCompareExchange
(
volatile
int
*
dest
,
int
xchg
,
int
comp
);
unsigned
__int64
__fetchadd4_rel
(
unsigned
int
*
addend
,
const
int
increment
);
/* ia64 memory barrier */
# define gmx_atomic_memory_barrier() __memory_barrier()
/* ia64 cmpxchg */
# define gmx_atomic_cmpxchg(a, oldval, newval) _InterlockedCompareExchange(&a->value,newval,oldval)
/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
# define gmx_ia64_fetchadd(a, inc) __fetchadd4_rel(a, inc)
#elif defined __GNUC__
/* ia64 memory barrier */
# define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
/* ia64 cmpxchg */
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
volatile
int
res
;
asm
volatile
(
"mov ar.ccv=%0;;"
::
"rO"
(
oldval
));
asm
volatile
(
"cmpxchg4.acq %0=[%1],%2,ar.ccv"
:
"=r"
(
res
)
:
"r"
(
&
a
->
value
),
"r"
(
newval
)
:
"memory"
);
return
res
;
}
/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16 */
#define gmx_ia64_fetchadd(a, inc) \
({ unsigned long res; \
asm volatile ("fetchadd4.rel %0=[%1],%2" \
: "=r"(res) : "r"(a), "i" (inc) : "memory"); \
res; \
})
#else
/* Unknown compiler */
# error Unknown ia64 compiler (not GCC or ICC) - modify gmx_thread.h!
#endif
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
volatile
int
oldval
,
newval
;
volatile
int
__i
=
i
;
/* Use fetchadd if, and only if, the increment value can be determined
* at compile time (otherwise this check is optimized away) and it is
* a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
*/
if
(
__builtin_constant_p
(
i
)
&&
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
)
{
oldval
=
gmx_ia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
newval
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
volatile
int
oldval
,
newval
;
volatile
int
__i
=
i
;
/* Use ia64 fetchadd if, and only if, the increment value can be determined
* at compile time (otherwise this check is optimized away) and it is
* a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
*/
if
(
__builtin_constant_p
(
i
)
&&
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
)
{
oldval
=
gmx_ia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
oldval
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
gmx_atomic_t
*
a
=
(
gmx_atomic_t
*
)
x
;
unsigned
long
value
;
value
=
gmx_atomic_cmpxchg
(
a
,
0
,
1
);
if
(
value
)
{
do
{
while
(
a
->
value
!=
0
)
{
gmx_atomic_memory_barrier
();
}
value
=
gmx_atomic_cmpxchg
(
a
,
0
,
1
);
}
while
(
value
);
}
}
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
return
(
gmx_atomic_cmpxchg
((
gmx_atomic_t
*
)
x
,
0
,
1
)
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
x
->
lock
=
0
;
}
while
(
0
);
}
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#undef gmx_ia64_fetchadd
#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
/* HP compiler on ia64 */
#include <machine/sys/inline.h>
#define gmx_atomic_memory_barrier() _Asm_mf()
#define gmx_hpia64_fetchadd(a, i) \
_Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL, \
(UInt32*)a,(unsigned int) i, \
(_Asm_ldhint)LDHINT_NONE)
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
ret
;
_Asm_mov_to_ar
((
_Asm_app_reg
)
_AREG_CCV
,(
Uint32
)
oldval
,
(
_Asm_fence
)(
_UP_CALL_FENCE
|
_UP_SYS_FENCE
|
_DOWN_CALL_FENCE
|
_DOWN_SYS_FENCE
));
ret
=
_Asm_cmpxchg
((
_Asm_sz
)
SZ_W
,(
_Asm_sem
)
_SEM_ACQ
,(
Uint32
*
)
a
,
(
Uint32
)
newval
,(
_Asm_ldhint
)
_LDHINT_NONE
);
return
ret
;
}
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static
inline
void
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
old
,
new
;
int
__i
=
i
;
/* On HP-UX we don't know any macro to determine whether the increment
* is known at compile time, but hopefully the call uses something simple
* like a constant, and then the optimizer should be able to do the job.
*/
if
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
{
oldval
=
gmx_hpia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
newval
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
oldval
,
newval
;
int
__i
=
i
;
/* On HP-UX we don't know any macro to determine whether the increment
* is known at compile time, but hopefully the call uses something simple
* like a constant, and then the optimizer should be able to do the job.
*/
if
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
{
oldval
=
gmx_hpia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
oldval
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
int
rc
;
rc
=
_Asm_xchg
((
_Asm_sz
)
_SZ_W
,
(
unsigned
int
*
)
x
,
1
(
_Asm_ldhit
)
_LDHINT_NONE
);
return
(
(
rc
>
0
)
?
1
:
0
);
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
int
status
=
1
;
do
{
if
(
*
((
unsigned
int
*
)
x
->
lock
)
==
0
)
{
status
=
gmx_spinlock_trylock
(
x
);
}
}
while
(
status
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
_Asm_fetchadd
((
_Asm_fasz
)
_SZ_W
,(
_Asm_sem
)
_SEM_REL
,
(
unsigned
int
*
)
x
,
-
1
,(
_Asm_ldhint
)
_LDHINT_NONE
);
}
static
inline
void
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#undef gmx_hpia64_fetchadd
#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
#include <windows.h>
#define gmx_atomic_memory_barrier()
typedef
struct
gmx_atomic
{
LONG
volatile
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
LONG
volatile
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
#define gmx_atomic_fetch_add(a, i) \
InterlockedExchangeAdd((LONG volatile *)a, (LONG) i)
#define gmx_atomic_add_return(a, i) \
( i + InterlockedExchangeAdd((LONG volatile *)a, (LONG) i) )
#define gmx_atomic_cmpxchg(a, oldval, newval) \
InterlockedCompareExchange((LONG volatile *)a, (LONG) newval, (LONG) oldval)
# define gmx_spinlock_lock(x) \
while((InterlockedCompareExchange((LONG volatile *)&x, 1, 0))!=0)
#define gmx_spinlock_trylock(x) \
InterlockedCompareExchange((LONG volatile *)&x, 1, 0)
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
*
(
volatile
signed
char
*
)(
&
(
x
)
->
lock
)
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
while
(
gmx_spinlock_islocked
(
x
))
{
Sleep
(
0
);
}
}
#elif defined(__xlC__) && defined (_AIX)
/* IBM xlC compiler on AIX */
#include <sys/atomic_op.h>
#define gmx_atomic_memory_barrier()
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
t
;
if
(
__check_lock
((
atomic_p
)
&
a
->
value
,
oldval
,
newval
))
{
/* Not successful - value had changed in memory. Reload value. */
t
=
a
->
value
;
}
else
{
/* replacement suceeded */
t
=
oldval
;
}
return
t
;
}
static
inline
void
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
oldval
,
newval
;
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
__check_lock
((
atomic_p
)
&
a
->
value
,
oldval
,
newval
));
return
newval
;
}
static
inline
void
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
oldval
,
newval
;
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
__check_lock
((
atomic_p
)
&
a
->
value
,
oldval
,
newval
));
return
oldval
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
__clear_lock
((
atomic_p
)
x
,
0
);
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
do
{
;
}
while
(
__check_lock
((
atomic_p
)
x
,
0
,
1
));
}
static
inline
void
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
/* Return 0 if we got the lock */
return
(
__check_lock
((
atomic_p
)
x
,
0
,
1
)
!=
0
)
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
__clear_lock
((
atomic_p
)
x
,
0
);
}
static
inline
void
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
*
((
atomic_p
)
x
)
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
while
(
gmx_spinlock_islocked
(
x
))
{
;
}
}
#else
/* No atomic operations, use mutex fallback. Documentation is in x86 section */
#define gmx_atomic_memory_barrier()
/* System mutex used for locking to guarantee atomicity */
static
pthread_mutex_t
gmx_atomic_mutex
=
PTHREAD_MUTEX_INITIALIZER
;
typedef
struct
gmx_atomic
{
int
value
;
}
gmx_atomic_t
;
#define gmx_spinlock_t pthread_mutex_t
# define GMX_SPINLOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
/* Since mutexes guarantee memory barriers this works fine */
#define gmx_atomic_read(a) ((a)->value)
static
inline
void
gmx_atomic_set
(
gmx_atomic_t
*
a
,
int
i
)
{
/* Mutexes here are necessary to guarantee memory visibility */
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
a
->
value
=
i
;
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
}
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
t
=
a
->
value
+
i
;
a
->
value
=
t
;
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
return
t
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
old_value
;
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
old_value
=
a
->
value
;
a
->
value
=
old_value
+
i
;
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
return
old_value
;
}
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldv
,
int
newv
)
{
int
t
;
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
t
=
a
->
value
;
if
(
t
==
oldv
)
{
a
->
value
=
newv
;
}
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
return
t
;
}
#define gmx_spinlock_init(lock) pthread_mutex_init(lock)
#define gmx_spinlock_lock(lock) pthread_mutex_lock(lock)
#define gmx_spinlock_trylock(lock) pthread_mutex_trylock(lock)
#define gmx_spinlock_unlock(lock) pthread_mutex_unlock(lock)
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
int
rc
;
if
(
gmx_spinlock_trylock
(
x
)
!=
0
)
{
/* It was locked */
return
1
;
}
else
{
/* We just locked it */
gmx_spinlock_unlock
(
x
);
return
0
;
}
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
int
rc
;
gmx_spinlock_lock
(
x
);
/* Got the lock now, so the waiting is over */
gmx_spinlock_unlock
(
x
);
}
#endif
/*! \brief Spinlock-based barrier type
*
* This barrier has the same functionality as the standard
* gmx_thread_barrier_t, but since it is based on spinlocks
* it provides faster synchronization at the cost of busy-waiting.
*
* Variables of this type should be initialized by calling
* gmx_spinlock_barrier_init() to set the number of threads
* that should be synchronized.
*/
typedef
struct
gmx_spinlock_barrier
{
gmx_atomic_t
count
;
/*!< Number of threads remaining */
int
threshold
;
/*!< Total number of threads */
volatile
int
cycle
;
/*!< Current cycle (alternating 0/1) */
}
gmx_spinlock_barrier_t
;
/*! \brief Initialize spinlock-based barrier
*
* \param barrier Pointer to _spinlock_ barrier. Note that this is not
* the same datatype as the full, thread based, barrier.
* \param count Number of threads to synchronize. All threads
* will be released after \a count calls to
* gmx_spinlock_barrier_wait().
*/
static
inline
void
gmx_spinlock_barrier_init
(
gmx_spinlock_barrier_t
*
barrier
,
int
count
)
{
barrier
->
threshold
=
count
;
barrier
->
cycle
=
0
;
gmx_atomic_set
(
&
(
barrier
->
count
),
count
);
}
/*! \brief Perform busy-waiting barrier synchronization
*
* This routine blocks until it has been called N times,
* where N is the count value the barrier was initialized with.
* After N total calls all threads return. The barrier automatically
* cycles, and thus requires another N calls to unblock another time.
*
* Note that spinlock-based barriers are completely different from
* standard ones (using mutexes and condition variables), only the
* functionality and names are similar.
*
* \param barrier Pointer to previously create barrier.
*
* \return The last thread returns -1, all the others 0.
*/
static
inline
int
gmx_spinlock_barrier_wait
(
gmx_spinlock_barrier_t
*
barrier
)
{
int
cycle
;
int
status
;
/* We don't need to lock or use atomic ops here, since the cycle index
* cannot change until after the last thread has performed the check
* further down. Further, they cannot reach this point in the next
* barrier iteration until all of them have been released, and that
* happens after the cycle value has been updated.
*
* No synchronization == fast synchronization.
*/
cycle
=
barrier
->
cycle
;
/* Decrement the count atomically and check if it is zero.
* This will only be true for the last thread calling us.
*/
if
(
gmx_atomic_add_return
(
&
(
barrier
->
count
),
-
1
)
==
0
)
{
gmx_atomic_set
(
&
(
barrier
->
count
),
barrier
->
threshold
);
barrier
->
cycle
=
!
barrier
->
cycle
;
status
=
-
1
;
}
else
{
/* Wait until the last thread changes the cycle index.
* We are both using a memory barrier, and explicit
* volatile pointer cast to make sure the compiler
* doesn't try to be smart and cache the contents.
*/
do
{
gmx_atomic_memory_barrier
();
}
while
(
*
(
volatile
int
*
)(
&
(
barrier
->
cycle
))
==
cycle
);
status
=
0
;
}
return
status
;
}
#ifdef __cplusplus
}
#endif
#endif
/* _GMX_ATOMIC_H_ */
platforms/cpu/tests/CMakeLists.txt
View file @
588a4c9d
...
@@ -11,7 +11,6 @@ IF( INCLUDE_SERIALIZATION )
...
@@ -11,7 +11,6 @@ IF( INCLUDE_SERIALIZATION )
INCLUDE_DIRECTORIES
(
${
OPENMM_DIR
}
/serialization/include
)
INCLUDE_DIRECTORIES
(
${
OPENMM_DIR
}
/serialization/include
)
SET
(
SHARED_OPENMM_SERIALIZATION
"OpenMMSerialization"
)
SET
(
SHARED_OPENMM_SERIALIZATION
"OpenMMSerialization"
)
ENDIF
(
INCLUDE_SERIALIZATION
)
ENDIF
(
INCLUDE_SERIALIZATION
)
GET_PROPERTY
(
COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS
)
# Automatically create tests using files named "Test*.cpp"
# Automatically create tests using files named "Test*.cpp"
FILE
(
GLOB TEST_PROGS
"*Test*.cpp"
)
FILE
(
GLOB TEST_PROGS
"*Test*.cpp"
)
...
@@ -21,7 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
...
@@ -21,7 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library
# Link with shared library
ADD_EXECUTABLE
(
${
TEST_ROOT
}
${
TEST_PROG
}
)
ADD_EXECUTABLE
(
${
TEST_ROOT
}
${
TEST_PROG
}
)
TARGET_LINK_LIBRARIES
(
${
TEST_ROOT
}
${
SHARED_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
TEST_ROOT
}
${
SHARED_TARGET
}
)
SET_TARGET_PROPERTIES
(
${
TEST_ROOT
}
PROPERTIES
COMPILE
_FLAGS
"
${
COMPILE_FLAGS
}
"
LINK
_FLAGS
"
${
COMPILE_FLAGS
}
"
)
SET_TARGET_PROPERTIES
(
${
TEST_ROOT
}
PROPERTIES
LINK
_FLAGS
"
${
EXTRA_
COMPILE_FLAGS
}
"
COMPILE
_FLAGS
"
${
EXTRA_
COMPILE_FLAGS
}
"
)
ADD_TEST
(
${
TEST_ROOT
}
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
single
)
ADD_TEST
(
${
TEST_ROOT
}
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
single
)
ENDFOREACH
(
TEST_PROG
${
TEST_PROGS
}
)
ENDFOREACH
(
TEST_PROG
${
TEST_PROGS
}
)
platforms/cpu/tests/TestCpuNeighborList.cpp
View file @
588a4c9d
...
@@ -35,6 +35,7 @@
...
@@ -35,6 +35,7 @@
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "CpuNeighborList.h"
#include "CpuPlatform.h"
#include "CpuPlatform.h"
#include "sfmt/SFMT.h"
#include "sfmt/SFMT.h"
...
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
...
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
const
float
boxSize
[
3
]
=
{
20.0
f
,
15.0
f
,
22.0
f
};
const
float
boxSize
[
3
]
=
{
20.0
f
,
15.0
f
,
22.0
f
};
OpenMM_SFMT
::
SFMT
sfmt
;
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
positions
(
4
*
numParticles
);
AlignedArray
<
float
>
positions
(
4
*
numParticles
);
for
(
int
i
=
0
;
i
<
4
*
numParticles
;
i
++
)
for
(
int
i
=
0
;
i
<
4
*
numParticles
;
i
++
)
if
(
i
%
4
<
3
)
if
(
i
%
4
<
3
)
positions
[
i
]
=
boxSize
[
i
%
4
]
*
genrand_real2
(
sfmt
);
positions
[
i
]
=
boxSize
[
i
%
4
]
*
genrand_real2
(
sfmt
);
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment