Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8e656070
Commit
8e656070
authored
Dec 10, 2013
by
Lee-Ping Wang
Browse files
Merge branch 'master' of
https://github.com/SimTk/openmm
parents
6160b726
ff6af025
Changes
80
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2173 additions
and
212 deletions
+2173
-212
CMakeLists.txt
CMakeLists.txt
+14
-10
examples/CMakeLists.txt
examples/CMakeLists.txt
+17
-11
openmmapi/include/openmm/internal/vectorize.h
openmmapi/include/openmm/internal/vectorize.h
+12
-0
platforms/cpu/CMakeLists.txt
platforms/cpu/CMakeLists.txt
+0
-4
platforms/cpu/include/AlignedArray.h
platforms/cpu/include/AlignedArray.h
+104
-0
platforms/cpu/include/CpuGBSAOBCForce.h
platforms/cpu/include/CpuGBSAOBCForce.h
+6
-2
platforms/cpu/include/CpuNeighborList.h
platforms/cpu/include/CpuNeighborList.h
+2
-1
platforms/cpu/include/CpuNonbondedForce.h
platforms/cpu/include/CpuNonbondedForce.h
+5
-2
platforms/cpu/include/CpuPlatform.h
platforms/cpu/include/CpuPlatform.h
+3
-2
platforms/cpu/include/CpuSETTLE.h
platforms/cpu/include/CpuSETTLE.h
+78
-0
platforms/cpu/sharedTarget/CMakeLists.txt
platforms/cpu/sharedTarget/CMakeLists.txt
+2
-3
platforms/cpu/src/CpuGBSAOBCForce.cpp
platforms/cpu/src/CpuGBSAOBCForce.cpp
+49
-45
platforms/cpu/src/CpuKernels.cpp
platforms/cpu/src/CpuKernels.cpp
+6
-6
platforms/cpu/src/CpuNeighborList.cpp
platforms/cpu/src/CpuNeighborList.cpp
+38
-13
platforms/cpu/src/CpuNonbondedForce.cpp
platforms/cpu/src/CpuNonbondedForce.cpp
+125
-108
platforms/cpu/src/CpuPlatform.cpp
platforms/cpu/src/CpuPlatform.cpp
+9
-2
platforms/cpu/src/CpuSETTLE.cpp
platforms/cpu/src/CpuSETTLE.cpp
+104
-0
platforms/cpu/src/gmx_atomic.h
platforms/cpu/src/gmx_atomic.h
+1596
-0
platforms/cpu/tests/CMakeLists.txt
platforms/cpu/tests/CMakeLists.txt
+1
-2
platforms/cpu/tests/TestCpuNeighborList.cpp
platforms/cpu/tests/TestCpuNeighborList.cpp
+2
-1
No files found.
CMakeLists.txt
View file @
8e656070
...
@@ -105,16 +105,20 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
...
@@ -105,16 +105,20 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
SET
(
LIB64
)
SET
(
LIB64
)
ENDIF
(
CMAKE_SIZEOF_VOID_P EQUAL 8
)
ENDIF
(
CMAKE_SIZEOF_VOID_P EQUAL 8
)
# Build universal binaries compatible with OS X 10.7
IF
(
APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET
)
SET
(
CMAKE_OSX_ARCHITECTURES
"i386;x86_64"
CACHE STRING
"The processor architectures to build for"
FORCE
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.7"
CACHE STRING
"The minimum version of OS X to support"
FORCE
)
ENDIF
(
APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET
)
# Improve the linking behavior of Mac libraries
IF
(
APPLE
)
IF
(
APPLE
)
# Build universal binaries compatible with OS X 10.7
IF
(
NOT CMAKE_OSX_DEPLOYMENT_TARGET
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.7"
CACHE STRING
"The minimum version of OS X to support"
FORCE
)
ENDIF
(
NOT CMAKE_OSX_DEPLOYMENT_TARGET
)
IF
(
NOT CMAKE_OSX_ARCHITECTURES
)
SET
(
CMAKE_OSX_ARCHITECTURES
"i386;x86_64"
CACHE STRING
"The processor architectures to build for"
FORCE
)
ENDIF
(
NOT CMAKE_OSX_ARCHITECTURES
)
# Improve the linking behavior of Mac libraries
SET
(
CMAKE_INSTALL_NAME_DIR
"@rpath"
)
SET
(
CMAKE_INSTALL_NAME_DIR
"@rpath"
)
SET_PROPERTY
(
GLOBAL PROPERTY COMPILE_FLAGS
"-stdlib=libc++ -mmacosx-version-min=10.7"
)
SET
(
EXTRA_COMPILE_FLAGS
"-stdlib=libc++"
)
ELSE
(
APPLE
)
SET
(
EXTRA_COMPILE_FLAGS
)
ENDIF
(
APPLE
)
ENDIF
(
APPLE
)
IF
(
UNIX AND NOT CMAKE_BUILD_TYPE
)
IF
(
UNIX AND NOT CMAKE_BUILD_TYPE
)
...
@@ -276,7 +280,7 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
...
@@ -276,7 +280,7 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
COMPILE_FLAGS
"
-DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY"
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY"
)
IF
(
WIN32
)
IF
(
WIN32
)
ADD_DEPENDENCIES
(
${
SHARED_TARGET
}
PthreadsLibraries
)
ADD_DEPENDENCIES
(
${
SHARED_TARGET
}
PthreadsLibraries
)
ENDIF
(
WIN32
)
ENDIF
(
WIN32
)
...
@@ -284,7 +288,7 @@ ENDIF(WIN32)
...
@@ -284,7 +288,7 @@ ENDIF(WIN32)
SET
(
OPENMM_BUILD_STATIC_LIB OFF CACHE BOOL
"Whether to build static OpenMM libraries"
)
SET
(
OPENMM_BUILD_STATIC_LIB OFF CACHE BOOL
"Whether to build static OpenMM libraries"
)
IF
(
OPENMM_BUILD_STATIC_LIB
)
IF
(
OPENMM_BUILD_STATIC_LIB
)
ADD_LIBRARY
(
${
STATIC_TARGET
}
STATIC
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
ADD_LIBRARY
(
${
STATIC_TARGET
}
STATIC
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES
COMPILE_FLAGS
"
-DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY"
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY"
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
IF
(
OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS
)
IF
(
OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS
)
...
...
examples/CMakeLists.txt
View file @
8e656070
...
@@ -26,14 +26,16 @@ SET(BUILD_TESTING_STATIC OFF)
...
@@ -26,14 +26,16 @@ SET(BUILD_TESTING_STATIC OFF)
IF
(
OPENMM_BUILD_STATIC_LIB
)
IF
(
OPENMM_BUILD_STATIC_LIB
)
SET
(
BUILD_TESTING_STATIC ON
)
SET
(
BUILD_TESTING_STATIC ON
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
FOREACH
(
EX_ROOT
${
CPP_EXAMPLES
}
)
FOREACH
(
EX_ROOT
${
CPP_EXAMPLES
}
)
IF
(
BUILD_TESTING_SHARED
)
IF
(
BUILD_TESTING_SHARED
)
# Link with shared library
# Link with shared library
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.cpp
)
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
PROPERTIES
PROPERTIES
PROJECT_LABEL
"Example -
${
EX_ROOT
}
"
)
PROJECT_LABEL
"Example -
${
EX_ROOT
}
"
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
ENDIF
(
BUILD_TESTING_SHARED
)
ENDIF
(
BUILD_TESTING_SHARED
)
...
@@ -42,9 +44,10 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
...
@@ -42,9 +44,10 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
SET
(
EX_STATIC
${
EX_ROOT
}
Static
)
SET
(
EX_STATIC
${
EX_ROOT
}
Static
)
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.cpp
)
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
PROPERTIES
PROPERTIES
COMPILE_FLAGS
"-DOPENMM_USE_STATIC_LIBRARIES"
PROJECT_LABEL
"Example -
${
EX_STATIC
}
"
PROJECT_LABEL
"Example -
${
EX_STATIC
}
"
)
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_USE_STATIC_LIBRARIES"
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
ENDIF
(
BUILD_TESTING_STATIC
)
ENDIF
(
BUILD_TESTING_STATIC
)
...
@@ -63,8 +66,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
...
@@ -63,8 +66,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
# C++ libraries on the link line.
# C++ libraries on the link line.
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.c Empty.cpp
)
ADD_EXECUTABLE
(
${
EX_ROOT
}
${
EX_ROOT
}
.c Empty.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
SET_TARGET_PROPERTIES
(
${
EX_ROOT
}
PROPERTIES
PROPERTIES
PROJECT_LABEL
"Example C -
${
EX_ROOT
}
"
)
PROJECT_LABEL
"Example C -
${
EX_ROOT
}
"
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_ROOT
}
${
SHARED_TARGET
}
)
ADD_DEPENDENCIES
(
${
EX_ROOT
}
ApiWrappers
)
ADD_DEPENDENCIES
(
${
EX_ROOT
}
ApiWrappers
)
ENDIF
(
BUILD_TESTING_SHARED
)
ENDIF
(
BUILD_TESTING_SHARED
)
...
@@ -76,9 +81,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
...
@@ -76,9 +81,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
# C++ libraries on the static link line.
# C++ libraries on the static link line.
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.c Empty.cpp
)
ADD_EXECUTABLE
(
${
EX_STATIC
}
${
EX_ROOT
}
.c Empty.cpp
)
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
SET_TARGET_PROPERTIES
(
${
EX_STATIC
}
PROPERTIES
PROPERTIES
COMPILE_FLAGS
"-DOPENMM_USE_STATIC_LIBRARIES"
PROJECT_LABEL
"Example C -
${
EX_STATIC
}
"
PROJECT_LABEL
"Example C -
${
EX_STATIC
}
"
)
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_USE_STATIC_LIBRARIES"
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
EX_STATIC
}
${
STATIC_TARGET
}
)
ADD_DEPENDENCIES
(
${
EX_STATIC
}
ApiWrappers
)
ADD_DEPENDENCIES
(
${
EX_STATIC
}
ApiWrappers
)
ENDIF
(
BUILD_TESTING_STATIC
)
ENDIF
(
BUILD_TESTING_STATIC
)
...
...
openmmapi/include/openmm/internal/vectorize.h
View file @
8e656070
...
@@ -91,6 +91,9 @@ public:
...
@@ -91,6 +91,9 @@ public:
fvec4
operator
&
(
fvec4
other
)
const
{
fvec4
operator
&
(
fvec4
other
)
const
{
return
_mm_and_ps
(
val
,
other
);
return
_mm_and_ps
(
val
,
other
);
}
}
fvec4
operator
|
(
fvec4
other
)
const
{
return
_mm_or_ps
(
val
,
other
);
}
fvec4
operator
==
(
fvec4
other
)
const
{
fvec4
operator
==
(
fvec4
other
)
const
{
return
_mm_cmpeq_ps
(
val
,
other
);
return
_mm_cmpeq_ps
(
val
,
other
);
}
}
...
@@ -157,6 +160,9 @@ public:
...
@@ -157,6 +160,9 @@ public:
ivec4
operator
&
(
ivec4
other
)
const
{
ivec4
operator
&
(
ivec4
other
)
const
{
return
_mm_and_si128
(
val
,
other
);
return
_mm_and_si128
(
val
,
other
);
}
}
ivec4
operator
|
(
ivec4
other
)
const
{
return
_mm_or_si128
(
val
,
other
);
}
ivec4
operator
==
(
ivec4
other
)
const
{
ivec4
operator
==
(
ivec4
other
)
const
{
return
_mm_cmpeq_epi32
(
val
,
other
);
return
_mm_cmpeq_epi32
(
val
,
other
);
}
}
...
@@ -267,5 +273,11 @@ static inline fvec4 operator/(float v1, fvec4 v2) {
...
@@ -267,5 +273,11 @@ static inline fvec4 operator/(float v1, fvec4 v2) {
return
fvec4
(
v1
)
/
v2
;
return
fvec4
(
v1
)
/
v2
;
}
}
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
fvec4
v1
,
fvec4
v2
,
ivec4
mask
)
{
return
fvec4
(
_mm_blendv_ps
(
v1
.
val
,
v2
.
val
,
_mm_castsi128_ps
(
mask
.
val
)));
}
#endif
/*OPENMM_VECTORIZE_H_*/
#endif
/*OPENMM_VECTORIZE_H_*/
platforms/cpu/CMakeLists.txt
View file @
8e656070
...
@@ -14,10 +14,6 @@
...
@@ -14,10 +14,6 @@
# libOpenMMCPU_static[_d].a
# libOpenMMCPU_static[_d].a
#----------------------------------------------------
#----------------------------------------------------
IF
(
APPLE
)
SET
(
CMAKE_OSX_DEPLOYMENT_TARGET
"10.6"
)
ENDIF
(
APPLE
)
SUBDIRS
(
tests
)
SUBDIRS
(
tests
)
# The source is organized into subdirectories, but we handle them all from
# The source is organized into subdirectories, but we handle them all from
...
...
platforms/cpu/include/AlignedArray.h
0 → 100644
View file @
8e656070
#ifndef OPENMM_ALIGNEDARRAY_H_
#define OPENMM_ALIGNEDARRAY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
namespace
OpenMM
{
/**
* This class represents an array in memory whose starting point is guaranteed to
* be aligned with a 16 byte boundary. This can improve the performance of vectorized
* code, since loads and stores are more efficient.
*/
template
<
class
T
>
class
AlignedArray
{
public:
/**
* Default constructor, to allow AlignedArrays to be used inside collections.
*/
AlignedArray
()
:
dataSize
(
0
),
baseData
(
0
),
data
(
0
)
{
}
/**
* Create an Aligned array that contains a specified number of elements.
*/
AlignedArray
(
int
size
)
{
allocate
(
size
);
}
~
AlignedArray
()
{
if
(
baseData
!=
0
)
delete
[]
baseData
;
}
/**
* Get the number of elements in the array.
*/
int
size
()
const
{
return
dataSize
;
}
/**
* Change the size of the array. This may cause all contents to be lost.
*/
void
resize
(
int
size
)
{
if
(
dataSize
==
size
)
return
;
if
(
baseData
!=
0
)
delete
[]
baseData
;
allocate
(
size
);
}
/**
* Get a reference to an element of the array.
*/
T
&
operator
[](
int
i
)
{
return
data
[
i
];
}
/**
* Get a const reference to an element of the array.
*/
const
T
&
operator
[](
int
i
)
const
{
return
data
[
i
];
}
private:
void
allocate
(
int
size
)
{
dataSize
=
size
;
baseData
=
new
char
[
size
*
sizeof
(
T
)
+
16
];
char
*
offsetData
=
baseData
+
15
;
offsetData
-=
(
long
long
)
offsetData
&
0xF
;
data
=
(
T
*
)
offsetData
;
}
int
dataSize
;
char
*
baseData
;
T
*
data
;
};
}
// namespace OpenMM
#endif
/*OPENMM_ALIGNEDARRAY_H_*/
platforms/cpu/include/CpuGBSAOBCForce.h
View file @
8e656070
...
@@ -25,6 +25,7 @@
...
@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
#define OPENMM_CPU_GBSAOBC_FORCE_H__
#define OPENMM_CPU_GBSAOBC_FORCE_H__
#include "AlignedArray.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/vectorize.h"
#include "openmm/internal/vectorize.h"
#include <set>
#include <set>
...
@@ -84,7 +85,7 @@ public:
...
@@ -84,7 +85,7 @@ public:
* @param totalEnergy total energy
* @param totalEnergy total energy
* @param threads the thread pool to use
* @param threads the thread pool to use
*/
*/
void
computeForce
(
const
std
::
vector
<
float
>&
posq
,
std
::
vector
<
std
::
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
void
computeForce
(
const
AlignedArray
<
float
>&
posq
,
std
::
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
/**
/**
* This routine contains the code executed by each thread.
* This routine contains the code executed by each thread.
...
@@ -105,10 +106,13 @@ private:
...
@@ -105,10 +106,13 @@ private:
float
logDX
,
logDXInv
;
float
logDX
,
logDXInv
;
// The following variables are used to make information accessible to the individual threads.
// The following variables are used to make information accessible to the individual threads.
float
const
*
posq
;
float
const
*
posq
;
std
::
vector
<
std
::
vector
<
float
>
>*
threadForce
;
std
::
vector
<
AlignedArray
<
float
>
>*
threadForce
;
bool
includeEnergy
;
bool
includeEnergy
;
void
*
atomicCounter
;
static
const
int
NUM_TABLE_POINTS
;
static
const
int
NUM_TABLE_POINTS
;
static
const
float
TABLE_MIN
;
static
const
float
TABLE_MAX
;
/**
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* Compute the displacement and squared distance between a collection of points, optionally using
...
...
platforms/cpu/include/CpuNeighborList.h
View file @
8e656070
...
@@ -32,6 +32,7 @@
...
@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "windowsExportCpu.h"
#include "windowsExportCpu.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
#include <set>
#include <set>
...
@@ -46,7 +47,7 @@ public:
...
@@ -46,7 +47,7 @@ public:
class
Voxels
;
class
Voxels
;
static
const
int
BlockSize
;
static
const
int
BlockSize
;
CpuNeighborList
();
CpuNeighborList
();
void
computeNeighborList
(
int
numAtoms
,
const
std
::
vector
<
float
>&
atomLocations
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
void
computeNeighborList
(
int
numAtoms
,
const
AlignedArray
<
float
>&
atomLocations
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
);
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
);
int
getNumBlocks
()
const
;
int
getNumBlocks
()
const
;
const
std
::
vector
<
int
>&
getSortedAtoms
()
const
;
const
std
::
vector
<
int
>&
getSortedAtoms
()
const
;
...
...
platforms/cpu/include/CpuNonbondedForce.h
View file @
8e656070
...
@@ -25,6 +25,7 @@
...
@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
...
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
...
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
std
::
vector
<
RealVec
>&
atomCoordinates
,
const
std
::
vector
<
std
::
pair
<
float
,
float
>
>&
atomParameters
,
void
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
std
::
vector
<
RealVec
>&
atomCoordinates
,
const
std
::
vector
<
std
::
pair
<
float
,
float
>
>&
atomParameters
,
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
std
::
vector
<
std
::
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
const
std
::
vector
<
std
::
set
<
int
>
>&
exclusions
,
std
::
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
);
/**
/**
* This routine contains the code executed by each thread.
* This routine contains the code executed by each thread.
...
@@ -156,6 +157,7 @@ private:
...
@@ -156,6 +157,7 @@ private:
bool
periodic
;
bool
periodic
;
bool
ewald
;
bool
ewald
;
bool
pme
;
bool
pme
;
bool
tableIsValid
;
const
CpuNeighborList
*
neighborList
;
const
CpuNeighborList
*
neighborList
;
float
periodicBoxSize
[
3
];
float
periodicBoxSize
[
3
];
float
cutoffDistance
,
switchingDistance
;
float
cutoffDistance
,
switchingDistance
;
...
@@ -172,8 +174,9 @@ private:
...
@@ -172,8 +174,9 @@ private:
RealVec
const
*
atomCoordinates
;
RealVec
const
*
atomCoordinates
;
std
::
pair
<
float
,
float
>
const
*
atomParameters
;
std
::
pair
<
float
,
float
>
const
*
atomParameters
;
std
::
set
<
int
>
const
*
exclusions
;
std
::
set
<
int
>
const
*
exclusions
;
std
::
vector
<
std
::
vector
<
float
>
>*
threadForce
;
std
::
vector
<
AlignedArray
<
float
>
>*
threadForce
;
bool
includeEnergy
;
bool
includeEnergy
;
void
*
atomicCounter
;
static
const
float
TWO_OVER_SQRT_PI
;
static
const
float
TWO_OVER_SQRT_PI
;
static
const
int
NUM_TABLE_POINTS
;
static
const
int
NUM_TABLE_POINTS
;
...
...
platforms/cpu/include/CpuPlatform.h
View file @
8e656070
...
@@ -32,6 +32,7 @@
...
@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "ReferencePlatform.h"
#include "ReferencePlatform.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
...
@@ -69,8 +70,8 @@ private:
...
@@ -69,8 +70,8 @@ private:
class
CpuPlatform
::
PlatformData
{
class
CpuPlatform
::
PlatformData
{
public:
public:
PlatformData
(
int
numParticles
);
PlatformData
(
int
numParticles
);
std
::
vector
<
float
>
posq
;
AlignedArray
<
float
>
posq
;
std
::
vector
<
std
::
vector
<
float
>
>
threadForce
;
std
::
vector
<
AlignedArray
<
float
>
>
threadForce
;
ThreadPool
threads
;
ThreadPool
threads
;
bool
isPeriodic
;
bool
isPeriodic
;
};
};
...
...
platforms/cpu/include/CpuSETTLE.h
0 → 100644
View file @
8e656070
#ifndef OPENMM_CPUSETTLE_H_
#define OPENMM_CPUSETTLE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "ReferenceSETTLEAlgorithm.h"
#include "openmm/System.h"
#include "openmm/internal/ThreadPool.h"
#include <vector>
namespace
OpenMM
{
/**
* This class uses multiple ReferenceSETTLEAlgorithm objects to execute the algorithm in parallel.
*/
class
OPENMM_EXPORT
CpuSETTLE
:
public
ReferenceConstraintAlgorithm
{
public:
class
ApplyToPositionsTask
;
class
ApplyToVelocitiesTask
;
CpuSETTLE
(
const
System
&
system
,
const
ReferenceSETTLEAlgorithm
&
settle
,
ThreadPool
&
threads
);
~
CpuSETTLE
();
/**
* Apply the constraint algorithm.
*
* @param atomCoordinates the original atom coordinates
* @param atomCoordinatesP the new atom coordinates
* @param inverseMasses 1/mass
* @param tolerance the constraint tolerance
*/
void
apply
(
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
std
::
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
);
/**
* Apply the constraint algorithm to velocities.
*
* @param atomCoordinates the atom coordinates
* @param atomCoordinatesP the velocities to modify
* @param inverseMasses 1/mass
* @param tolerance the constraint tolerance
*/
void
applyToVelocities
(
std
::
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
std
::
vector
<
OpenMM
::
RealVec
>&
velocities
,
std
::
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
);
private:
std
::
vector
<
ReferenceSETTLEAlgorithm
*>
threadSettle
;
ThreadPool
&
threads
;
};
}
// namespace OpenMM
#endif
/*OPENMM_CPUSETTLE_H_*/
platforms/cpu/sharedTarget/CMakeLists.txt
View file @
8e656070
GET_PROPERTY
(
COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS
)
SET_SOURCE_FILES_PROPERTIES
(
${
SOURCE_FILES
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-msse4.1"
)
SET_SOURCE_FILES_PROPERTIES
(
${
SOURCE_FILES
}
PROPERTIES COMPILE_FLAGS
"
${
COMPILE_FLAGS
}
-msse4.1"
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
IF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
...
@@ -8,6 +7,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
...
@@ -8,6 +7,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
SET
(
MAIN_OPENMM_LIB
${
OPENMM_LIBRARY_NAME
}
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
ENDIF
(
UNIX AND CMAKE_BUILD_TYPE MATCHES Debug
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
PTHREADS_LIB
}
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
MAIN_OPENMM_LIB
}
${
PTHREADS_LIB
}
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
COMPILE_FLAGS
"
-DOPENMM_CPU_BUILDING_SHARED_LIBRARY"
LINK_FLAGS
"
${
COMPILE_FLAGS
}
"
)
SET_TARGET_PROPERTIES
(
${
SHARED_TARGET
}
PROPERTIES
LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
"
COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_CPU_BUILDING_SHARED_LIBRARY"
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
INSTALL_TARGETS
(
/lib/plugins RUNTIME_DIRECTORY /lib/plugins
${
SHARED_TARGET
}
)
platforms/cpu/src/CpuGBSAOBCForce.cpp
View file @
8e656070
...
@@ -24,14 +24,16 @@
...
@@ -24,14 +24,16 @@
#include "CpuGBSAOBCForce.h"
#include "CpuGBSAOBCForce.h"
#include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMRealType.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
#include <cmath>
#include <cmath>
using
namespace
std
;
using
namespace
std
;
using
namespace
OpenMM
;
using
namespace
OpenMM
;
const
int
CpuGBSAOBCForce
::
NUM_TABLE_POINTS
=
1025
;
const
int
CpuGBSAOBCForce
::
NUM_TABLE_POINTS
=
4096
;
const
float
CpuGBSAOBCForce
::
TABLE_MIN
=
0.25
f
;
const
float
CpuGBSAOBCForce
::
TABLE_MAX
=
1.5
f
;
class
CpuGBSAOBCForce
::
ComputeTask
:
public
ThreadPool
::
Task
{
class
CpuGBSAOBCForce
::
ComputeTask
:
public
ThreadPool
::
Task
{
public:
public:
...
@@ -44,22 +46,12 @@ public:
...
@@ -44,22 +46,12 @@ public:
};
};
CpuGBSAOBCForce
::
CpuGBSAOBCForce
()
:
cutoff
(
false
),
periodic
(
false
)
{
CpuGBSAOBCForce
::
CpuGBSAOBCForce
()
:
cutoff
(
false
),
periodic
(
false
)
{
logDX
=
0.5
/
NUM_TABLE_POINTS
;
logDX
=
(
TABLE_MAX
-
TABLE_MIN
)
/
NUM_TABLE_POINTS
;
logDXInv
=
1.0
f
/
logDX
;
logDXInv
=
1.0
f
/
logDX
;
vector
<
double
>
x
(
NUM_TABLE_POINTS
+
1
);
logTable
.
resize
(
NUM_TABLE_POINTS
+
4
);
vector
<
double
>
y
(
NUM_TABLE_POINTS
+
1
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
4
;
i
++
)
{
vector
<
double
>
deriv
;
double
x
=
TABLE_MIN
+
i
*
logDX
;
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
1
;
i
++
)
{
logTable
[
i
]
=
log
(
x
);
x
[
i
]
=
0.5
+
i
*
0.5
/
NUM_TABLE_POINTS
;
y
[
i
]
=
log
(
x
[
i
]);
}
SplineFitter
::
createNaturalSpline
(
x
,
y
,
deriv
);
logTable
.
resize
(
4
*
NUM_TABLE_POINTS
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
;
i
++
)
{
logTable
[
4
*
i
]
=
(
float
)
y
[
i
];
logTable
[
4
*
i
+
1
]
=
(
float
)
y
[
i
+
1
];
logTable
[
4
*
i
+
2
]
=
(
float
)
(
deriv
[
i
]
*
logDX
*
logDX
/
6
);
logTable
[
4
*
i
+
3
]
=
(
float
)
(
deriv
[
i
+
1
]
*
logDX
*
logDX
/
6
);
}
}
}
}
...
@@ -93,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
...
@@ -93,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
obcChain
.
resize
(
params
.
size
()
+
3
);
obcChain
.
resize
(
params
.
size
()
+
3
);
}
}
void
CpuGBSAOBCForce
::
computeForce
(
const
std
::
vector
<
float
>&
posq
,
vector
<
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
void
CpuGBSAOBCForce
::
computeForce
(
const
AlignedArray
<
float
>&
posq
,
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
// Record the parameters for the threads.
// Record the parameters for the threads.
this
->
posq
=
&
posq
[
0
];
this
->
posq
=
&
posq
[
0
];
...
@@ -104,16 +96,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
...
@@ -104,16 +96,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
threadBornForces
.
resize
(
numThreads
);
threadBornForces
.
resize
(
numThreads
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
threadBornForces
[
i
].
resize
(
particleParams
.
size
()
+
3
);
threadBornForces
[
i
].
resize
(
particleParams
.
size
()
+
3
);
gmx_atomic_t
counter
;
this
->
atomicCounter
=
&
counter
;
// Signal the threads to start running and wait for them to finish.
// Signal the threads to start running and wait for them to finish.
ComputeTask
task
(
*
this
);
ComputeTask
task
(
*
this
);
gmx_atomic_set
(
&
counter
,
0
);
threads
.
execute
(
task
);
threads
.
execute
(
task
);
threads
.
waitForThreads
();
// Compute Born radii
threads
.
waitForThreads
();
// Compute Born radii
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
resumeThreads
();
threads
.
waitForThreads
();
// Compute surface area term
threads
.
waitForThreads
();
// Compute surface area term
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
resumeThreads
();
threads
.
waitForThreads
();
// First loop
threads
.
waitForThreads
();
// First loop
gmx_atomic_set
(
&
counter
,
0
);
threads
.
resumeThreads
();
threads
.
resumeThreads
();
threads
.
waitForThreads
();
// Second loop
threads
.
waitForThreads
();
// Second loop
...
@@ -141,8 +139,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -141,8 +139,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Calculate Born radii
// Calculate Born radii
for
(
int
blockStart
=
start
;
blockStart
<
end
;
blockStart
+=
4
)
{
while
(
true
)
{
int
numInBlock
=
min
(
4
,
end
-
blockStart
);
int
blockStart
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
4
);
if
(
blockStart
>=
numParticles
)
break
;
int
numInBlock
=
min
(
4
,
numParticles
-
blockStart
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
...
@@ -213,7 +214,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -213,7 +214,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
vector
<
float
>&
bornForces
=
threadBornForces
[
threadIndex
];
vector
<
float
>&
bornForces
=
threadBornForces
[
threadIndex
];
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
bornForces
[
i
]
=
0.0
f
;
bornForces
[
i
]
=
0.0
f
;
for
(
int
atomI
=
start
;
atomI
<
end
;
atomI
++
)
{
while
(
true
)
{
int
atomI
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
atomI
>=
numParticles
)
break
;
if
(
bornRadii
[
atomI
]
>
0
)
{
if
(
bornRadii
[
atomI
]
>
0
)
{
float
radiusI
=
particleParams
[
atomI
].
first
+
dielectricOffset
;
float
radiusI
=
particleParams
[
atomI
].
first
+
dielectricOffset
;
float
r
=
radiusI
+
probeRadius
;
float
r
=
radiusI
+
probeRadius
;
...
@@ -235,8 +239,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -235,8 +239,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
preFactor
=
ONE_4PI_EPS0
*
((
1.0
f
/
solventDielectric
)
-
(
1.0
f
/
soluteDielectric
));
preFactor
=
ONE_4PI_EPS0
*
((
1.0
f
/
solventDielectric
)
-
(
1.0
f
/
soluteDielectric
));
else
else
preFactor
=
0.0
f
;
preFactor
=
0.0
f
;
for
(
int
blockStart
=
start
;
blockStart
<
end
;
blockStart
+=
4
)
{
while
(
true
)
{
int
numInBlock
=
min
(
4
,
end
-
blockStart
);
int
blockStart
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
4
);
if
(
blockStart
>=
numParticles
)
break
;
int
numInBlock
=
min
(
4
,
numParticles
-
blockStart
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
float
atomCharge
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
float
atomCharge
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
...
@@ -303,13 +310,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -303,13 +310,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Second loop of Born energy computation.
// Second loop of Born energy computation.
for
(
int
blockStart
=
start
;
blockStart
<
end
;
blockStart
+=
4
)
{
while
(
true
)
{
int
blockStart
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
4
);
if
(
blockStart
>=
numParticles
)
break
;
fvec4
bornForce
(
0.0
f
);
fvec4
bornForce
(
0.0
f
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
bornForce
+=
fvec4
(
&
threadBornForces
[
i
][
blockStart
]);
bornForce
+=
fvec4
(
&
threadBornForces
[
i
][
blockStart
]);
fvec4
radii
(
&
bornRadii
[
blockStart
]);
fvec4
radii
(
&
bornRadii
[
blockStart
]);
bornForce
*=
radii
*
radii
*
fvec4
(
&
obcChain
[
blockStart
]);
bornForce
*=
radii
*
radii
*
fvec4
(
&
obcChain
[
blockStart
]);
int
numInBlock
=
min
(
4
,
end
-
blockStart
);
int
numInBlock
=
min
(
4
,
numParticles
-
blockStart
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
ivec4
blockAtomIndex
(
blockStart
,
blockStart
+
1
,
blockStart
+
2
,
blockStart
+
3
);
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
float
atomRadius
[
4
],
atomx
[
4
],
atomy
[
4
],
atomz
[
4
];
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
int
blockMask
[
4
]
=
{
0
,
0
,
0
,
0
};
...
@@ -351,14 +361,13 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
...
@@ -351,14 +361,13 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
fvec4
logRatio
=
fastLog
(
u_ij
/
l_ij
);
fvec4
logRatio
=
fastLog
(
u_ij
/
l_ij
);
fvec4
t3
=
0.125
f
*
(
1.0
f
+
scaledRadiusJ2
*
r2Inverse
)
*
(
l_ij2
-
u_ij2
)
+
0.25
f
*
logRatio
*
r2Inverse
;
fvec4
t3
=
0.125
f
*
(
1.0
f
+
scaledRadiusJ2
*
r2Inverse
)
*
(
l_ij2
-
u_ij2
)
+
0.25
f
*
logRatio
*
r2Inverse
;
fvec4
de
=
bornForce
*
t3
*
rInverse
;
fvec4
de
=
bornForce
*
t3
*
rInverse
;
de
=
blend
(
0.0
f
,
de
,
include
);
fvec4
result
[
4
]
=
{
dx
*
de
,
dy
*
de
,
dz
*
de
,
0.0
f
};
fvec4
result
[
4
]
=
{
dx
*
de
,
dy
*
de
,
dz
*
de
,
0.0
f
};
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
fvec4
atomForce
(
forces
+
4
*
atomJ
);
fvec4
atomForce
(
forces
+
4
*
atomJ
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
blockAtomForce
[
j
]
+=
result
[
j
];
blockAtomForce
[
j
]
+=
result
[
j
];
atomForce
-=
result
[
j
];
atomForce
-=
result
[
j
];
}
}
}
atomForce
.
store
(
forces
+
4
*
atomJ
);
atomForce
.
store
(
forces
+
4
*
atomJ
);
}
}
...
@@ -385,21 +394,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
...
@@ -385,21 +394,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
fvec4
CpuGBSAOBCForce
::
fastLog
(
fvec4
x
)
{
fvec4
CpuGBSAOBCForce
::
fastLog
(
fvec4
x
)
{
// Evaluate log(x) using a lookup table for speed.
// Evaluate log(x) using a lookup table for speed.
float
y
[
4
];
if
(
any
((
x
<
TABLE_MIN
)
|
(
x
>=
TABLE_MAX
)))
fvec4
x1
=
(
x
-
0.5
f
)
*
logDXInv
;
return
fvec4
(
logf
(
x
[
0
]),
logf
(
x
[
1
]),
logf
(
x
[
2
]),
logf
(
x
[
3
]));
fvec4
x1
=
(
x
-
TABLE_MIN
)
*
logDXInv
;
ivec4
index
=
floor
(
x1
);
ivec4
index
=
floor
(
x1
);
fvec4
coeff
[
4
];
fvec4
coeff2
=
x1
-
index
;
coeff
[
1
]
=
x1
-
index
;
fvec4
coeff1
=
1.0
f
-
coeff2
;
coeff
[
0
]
=
1.0
f
-
coeff
[
1
];
fvec4
t1
(
&
logTable
[
index
[
0
]]);
coeff
[
2
]
=
coeff
[
0
]
*
coeff
[
0
]
*
coeff
[
0
]
-
coeff
[
0
];
fvec4
t2
(
&
logTable
[
index
[
1
]]);
coeff
[
3
]
=
coeff
[
1
]
*
coeff
[
1
]
*
coeff
[
1
]
-
coeff
[
1
];
fvec4
t3
(
&
logTable
[
index
[
2
]]);
transpose
(
coeff
[
0
],
coeff
[
1
],
coeff
[
2
],
coeff
[
3
]);
fvec4
t4
(
&
logTable
[
index
[
3
]]);
static
float
maxdiff
=
0.0
f
;
transpose
(
t1
,
t2
,
t3
,
t4
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
return
coeff1
*
t1
+
coeff2
*
t2
;
if
(
index
[
i
]
>=
0
&&
index
[
i
]
<
NUM_TABLE_POINTS
)
y
[
i
]
=
dot4
(
coeff
[
i
],
fvec4
(
&
logTable
[
4
*
index
[
i
]]));
else
y
[
i
]
=
logf
(
x
[
i
]);
}
return
fvec4
(
y
);
}
}
platforms/cpu/src/CpuKernels.cpp
View file @
8e656070
...
@@ -81,16 +81,16 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
...
@@ -81,16 +81,16 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
// Convert the positions to single precision and apply periodic boundary conditions
// Convert the positions to single precision and apply periodic boundary conditions
vector
<
float
>&
posq
=
data
.
posq
;
AlignedArray
<
float
>&
posq
=
data
.
posq
;
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
float
float
BoxSize
[
3
]
=
{
(
float
)
boxSize
[
0
],
(
float
)
boxSize
[
1
],
(
float
)
boxSize
[
2
]};
double
inv
BoxSize
[
3
]
=
{
1
/
boxSize
[
0
],
1
/
boxSize
[
1
],
1
/
boxSize
[
2
]};
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
if
(
data
.
isPeriodic
)
if
(
data
.
isPeriodic
)
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
{
for
(
int
j
=
0
;
j
<
3
;
j
++
)
{
RealOpenMM
x
=
posData
[
i
][
j
];
RealOpenMM
x
=
posData
[
i
][
j
];
double
base
=
floor
(
x
/
b
oxSize
[
j
])
*
boxSize
[
j
];
double
base
=
floor
(
x
*
invB
oxSize
[
j
])
*
boxSize
[
j
];
posq
[
4
*
i
+
j
]
=
(
float
)
(
x
-
base
);
posq
[
4
*
i
+
j
]
=
(
float
)
(
x
-
base
);
}
}
else
else
...
@@ -255,12 +255,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -255,12 +255,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
}
}
}
}
}
}
vector
<
float
>&
posq
=
data
.
posq
;
AlignedArray
<
float
>&
posq
=
data
.
posq
;
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
vector
<
RealVec
>&
posData
=
extractPositions
(
context
);
vector
<
RealVec
>&
forceData
=
extractForces
(
context
);
vector
<
RealVec
>&
forceData
=
extractForces
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
RealVec
boxSize
=
extractBoxSize
(
context
);
float
floatBoxSize
[
3
]
=
{(
float
)
boxSize
[
0
],
(
float
)
boxSize
[
1
],
(
float
)
boxSize
[
2
]};
float
floatBoxSize
[
3
]
=
{(
float
)
boxSize
[
0
],
(
float
)
boxSize
[
1
],
(
float
)
boxSize
[
2
]};
double
energy
=
ewaldSelfEnergy
;
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
)
;
bool
ewald
=
(
nonbondedMethod
==
Ewald
);
bool
ewald
=
(
nonbondedMethod
==
Ewald
);
bool
pme
=
(
nonbondedMethod
==
PME
);
bool
pme
=
(
nonbondedMethod
==
PME
);
if
(
nonbondedMethod
!=
NoCutoff
)
{
if
(
nonbondedMethod
!=
NoCutoff
)
{
...
@@ -330,7 +330,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -330,7 +330,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
PmeIO
io
(
&
posq
[
0
],
&
data
.
threadForce
[
0
][
0
],
numParticles
);
PmeIO
io
(
&
posq
[
0
],
&
data
.
threadForce
[
0
][
0
],
numParticles
);
Vec3
periodicBoxSize
(
boxSize
[
0
],
boxSize
[
1
],
boxSize
[
2
]);
Vec3
periodicBoxSize
(
boxSize
[
0
],
boxSize
[
1
],
boxSize
[
2
]);
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
beginComputation
(
io
,
periodicBoxSize
,
includeEnergy
);
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
beginComputation
(
io
,
periodicBoxSize
,
includeEnergy
);
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
finishComputation
(
io
);
nonbondedEnergy
+=
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
finishComputation
(
io
);
}
}
else
else
nonbonded
.
calculateReciprocalIxn
(
numParticles
,
&
posq
[
0
],
posData
,
particleParams
,
exclusions
,
forceData
,
includeEnergy
?
&
nonbondedEnergy
:
NULL
);
nonbonded
.
calculateReciprocalIxn
(
numParticles
,
&
posq
[
0
],
posData
,
particleParams
,
exclusions
,
forceData
,
includeEnergy
?
&
nonbondedEnergy
:
NULL
);
...
...
platforms/cpu/src/CpuNeighborList.cpp
View file @
8e656070
...
@@ -48,6 +48,8 @@ const int CpuNeighborList::BlockSize = 4;
...
@@ -48,6 +48,8 @@ const int CpuNeighborList::BlockSize = 4;
class
VoxelIndex
class
VoxelIndex
{
{
public:
public:
VoxelIndex
()
:
x
(
0
),
y
(
0
)
{
}
VoxelIndex
(
int
x
,
int
y
)
:
x
(
x
),
y
(
y
)
{
VoxelIndex
(
int
x
,
int
y
)
:
x
(
x
),
y
(
y
)
{
}
}
int
x
;
int
x
;
...
@@ -173,6 +175,9 @@ public:
...
@@ -173,6 +175,9 @@ public:
float
centerPos
[
4
];
float
centerPos
[
4
];
blockCenter
.
store
(
centerPos
);
blockCenter
.
store
(
centerPos
);
VoxelIndex
centerVoxelIndex
=
getVoxelIndex
(
centerPos
);
VoxelIndex
centerVoxelIndex
=
getVoxelIndex
(
centerPos
);
VoxelIndex
atomVoxelIndex
[
BlockSize
];
for
(
int
i
=
0
;
i
<
(
int
)
blockAtoms
.
size
();
i
++
)
atomVoxelIndex
[
i
]
=
getVoxelIndex
(
&
atomLocations
[
4
*
blockAtoms
[
i
]]);
int
startx
=
centerVoxelIndex
.
x
-
dIndexX
;
int
startx
=
centerVoxelIndex
.
x
-
dIndexX
;
int
starty
=
centerVoxelIndex
.
y
-
dIndexY
;
int
starty
=
centerVoxelIndex
.
y
-
dIndexY
;
int
endx
=
centerVoxelIndex
.
x
+
dIndexX
;
int
endx
=
centerVoxelIndex
.
x
+
dIndexX
;
...
@@ -194,39 +199,59 @@ public:
...
@@ -194,39 +199,59 @@ public:
voxelIndex
.
x
=
x
;
voxelIndex
.
x
=
x
;
if
(
usePeriodic
)
if
(
usePeriodic
)
voxelIndex
.
x
=
(
x
<
0
?
x
+
nx
:
(
x
>=
nx
?
x
-
nx
:
x
));
voxelIndex
.
x
=
(
x
<
0
?
x
+
nx
:
(
x
>=
nx
?
x
-
nx
:
x
));
float
dx
=
max
(
0.0
f
,
voxelSizeX
*
max
(
0
,
abs
(
centerVoxelIndex
.
x
-
x
)
-
1
)
-
blockWidth
[
0
]);
for
(
int
y
=
starty
;
y
<=
endy
;
++
y
)
{
for
(
int
y
=
starty
;
y
<=
endy
;
++
y
)
{
voxelIndex
.
y
=
y
;
voxelIndex
.
y
=
y
;
if
(
usePeriodic
)
if
(
usePeriodic
)
voxelIndex
.
y
=
(
y
<
0
?
y
+
ny
:
(
y
>=
ny
?
y
-
ny
:
y
));
voxelIndex
.
y
=
(
y
<
0
?
y
+
ny
:
(
y
>=
ny
?
y
-
ny
:
y
));
float
dy
=
max
(
0.0
f
,
voxelSizeY
*
max
(
0
,
abs
(
centerVoxelIndex
.
y
-
y
)
-
1
)
-
blockWidth
[
1
]);
// Identify the range of atoms within this bin we need to search. When using periodic boundary
// Identify the range of atoms within this bin we need to search. When using periodic boundary
// conditions, there may be two separate ranges.
// conditions, there may be two separate ranges.
float
dz
=
maxDistance
+
blockWidth
[
2
];
float
minz
=
centerPos
[
2
];
dz
=
sqrtf
(
max
(
0.0
f
,
dz
*
dz
-
dx
*
dx
-
dy
*
dy
));
float
maxz
=
centerPos
[
2
];
fvec4
offset
(
voxelSizeX
*
x
+
(
usePeriodic
?
0.0
f
:
minx
),
voxelSizeY
*
y
+
(
usePeriodic
?
0.0
f
:
miny
),
0
,
0
);
for
(
int
k
=
0
;
k
<
(
int
)
blockAtoms
.
size
();
k
++
)
{
const
float
*
atomPos
=
&
atomLocations
[
4
*
blockAtoms
[
k
]];
fvec4
posVec
(
atomPos
);
fvec4
delta1
=
offset
-
posVec
;
fvec4
delta2
=
delta1
+
fvec4
(
voxelSizeX
,
voxelSizeY
,
0
,
0
);
if
(
usePeriodic
)
{
delta1
-=
round
(
delta1
*
invBoxSize
)
*
boxSize
;
delta2
-=
round
(
delta2
*
invBoxSize
)
*
boxSize
;
}
fvec4
delta
=
min
(
abs
(
delta1
),
abs
(
delta2
));
float
dx
=
(
x
==
atomVoxelIndex
[
k
].
x
?
0.0
f
:
delta
[
0
]);
float
dy
=
(
y
==
atomVoxelIndex
[
k
].
y
?
0.0
f
:
delta
[
1
]);
float
dist2
=
maxDistanceSquared
-
dx
*
dx
-
dy
*
dy
;
if
(
dist2
>
0
)
{
float
dist
=
sqrtf
(
dist2
);
minz
=
min
(
minz
,
atomPos
[
2
]
-
dist
);
maxz
=
max
(
maxz
,
atomPos
[
2
]
+
dist
);
}
}
if
(
minz
==
maxz
)
continue
;
bool
needPeriodic
=
(
centerPos
[
0
]
-
blockWidth
[
0
]
<
maxDistance
||
centerPos
[
0
]
+
blockWidth
[
0
]
>
periodicBoxSize
[
0
]
-
maxDistance
||
bool
needPeriodic
=
(
centerPos
[
0
]
-
blockWidth
[
0
]
<
maxDistance
||
centerPos
[
0
]
+
blockWidth
[
0
]
>
periodicBoxSize
[
0
]
-
maxDistance
||
centerPos
[
1
]
-
blockWidth
[
1
]
<
maxDistance
||
centerPos
[
1
]
+
blockWidth
[
1
]
>
periodicBoxSize
[
1
]
-
maxDistance
||
centerPos
[
1
]
-
blockWidth
[
1
]
<
maxDistance
||
centerPos
[
1
]
+
blockWidth
[
1
]
>
periodicBoxSize
[
1
]
-
maxDistance
||
centerPos
[
2
]
-
dz
<
0.0
f
||
centerPos
[
2
]
+
d
z
>
periodicBoxSize
[
2
]);
minz
<
0.0
f
||
max
z
>
periodicBoxSize
[
2
]);
int
rangeStart
[
2
];
int
rangeStart
[
2
];
int
rangeEnd
[
2
];
int
rangeEnd
[
2
];
rangeStart
[
0
]
=
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
-
d
z
);
rangeStart
[
0
]
=
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
min
z
);
if
(
needPeriodic
)
{
if
(
needPeriodic
)
{
numRanges
=
2
;
numRanges
=
2
;
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
d
z
);
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
max
z
);
if
(
rangeStart
[
0
]
>
0
)
{
if
(
rangeStart
[
0
]
>
0
)
{
rangeStart
[
1
]
=
0
;
rangeStart
[
1
]
=
0
;
rangeEnd
[
1
]
=
min
(
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
d
z
-
periodicBoxSize
[
2
]),
rangeStart
[
0
]);
rangeEnd
[
1
]
=
min
(
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
max
z
-
periodicBoxSize
[
2
]),
rangeStart
[
0
]);
}
}
else
{
else
{
rangeStart
[
1
]
=
max
(
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
-
d
z
+
periodicBoxSize
[
2
]),
rangeEnd
[
0
]);
rangeStart
[
1
]
=
max
(
findLowerBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
min
z
+
periodicBoxSize
[
2
]),
rangeEnd
[
0
]);
rangeEnd
[
1
]
=
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
].
size
();
rangeEnd
[
1
]
=
bins
[
voxelIndex
.
x
][
voxelIndex
.
y
].
size
();
}
}
}
}
else
{
else
{
numRanges
=
1
;
numRanges
=
1
;
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
centerPos
[
2
]
+
d
z
);
rangeEnd
[
0
]
=
findUpperBound
(
voxelIndex
.
x
,
voxelIndex
.
y
,
max
z
);
}
}
// Loop over atoms and check to see if they are neighbors of this block.
// Loop over atoms and check to see if they are neighbors of this block.
...
@@ -307,7 +332,7 @@ public:
...
@@ -307,7 +332,7 @@ public:
CpuNeighborList
::
CpuNeighborList
()
{
CpuNeighborList
::
CpuNeighborList
()
{
}
}
void
CpuNeighborList
::
computeNeighborList
(
int
numAtoms
,
const
vector
<
float
>&
atomLocations
,
const
vector
<
set
<
int
>
>&
exclusions
,
void
CpuNeighborList
::
computeNeighborList
(
int
numAtoms
,
const
AlignedArray
<
float
>&
atomLocations
,
const
vector
<
set
<
int
>
>&
exclusions
,
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
)
{
const
float
*
periodicBoxSize
,
bool
usePeriodic
,
float
maxDistance
,
ThreadPool
&
threads
)
{
int
numBlocks
=
(
numAtoms
+
BlockSize
-
1
)
/
BlockSize
;
int
numBlocks
=
(
numAtoms
+
BlockSize
-
1
)
/
BlockSize
;
blockNeighbors
.
resize
(
numBlocks
);
blockNeighbors
.
resize
(
numBlocks
);
...
@@ -353,8 +378,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
...
@@ -353,8 +378,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
if
(
!
usePeriodic
)
if
(
!
usePeriodic
)
edgeSizeX
=
edgeSizeY
=
maxDistance
;
// TODO - adjust this as needed
edgeSizeX
=
edgeSizeY
=
maxDistance
;
// TODO - adjust this as needed
else
{
else
{
edgeSizeX
=
0.
5
f
*
periodicBoxSize
[
0
]
/
floorf
(
periodicBoxSize
[
0
]
/
maxDistance
);
edgeSizeX
=
0.
6
f
*
periodicBoxSize
[
0
]
/
floorf
(
periodicBoxSize
[
0
]
/
maxDistance
);
edgeSizeY
=
0.
5
f
*
periodicBoxSize
[
1
]
/
floorf
(
periodicBoxSize
[
1
]
/
maxDistance
);
edgeSizeY
=
0.
6
f
*
periodicBoxSize
[
1
]
/
floorf
(
periodicBoxSize
[
1
]
/
maxDistance
);
}
}
Voxels
voxels
(
edgeSizeX
,
edgeSizeY
,
minx
,
maxx
,
miny
,
maxy
,
periodicBoxSize
,
usePeriodic
);
Voxels
voxels
(
edgeSizeX
,
edgeSizeY
,
minx
,
maxx
,
miny
,
maxy
,
periodicBoxSize
,
usePeriodic
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
...
...
platforms/cpu/src/CpuNonbondedForce.cpp
View file @
8e656070
...
@@ -29,8 +29,8 @@
...
@@ -29,8 +29,8 @@
#include "CpuNonbondedForce.h"
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "ReferencePME.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
// In case we're using some primitive version of Visual Studio this will
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
// make sure that erf() and erfc() are defined.
...
@@ -40,7 +40,7 @@ using namespace std;
...
@@ -40,7 +40,7 @@ using namespace std;
using
namespace
OpenMM
;
using
namespace
OpenMM
;
const
float
CpuNonbondedForce
::
TWO_OVER_SQRT_PI
=
(
float
)
(
2
/
sqrt
(
PI_M
));
const
float
CpuNonbondedForce
::
TWO_OVER_SQRT_PI
=
(
float
)
(
2
/
sqrt
(
PI_M
));
const
int
CpuNonbondedForce
::
NUM_TABLE_POINTS
=
1025
;
const
int
CpuNonbondedForce
::
NUM_TABLE_POINTS
=
2048
;
class
CpuNonbondedForce
::
ComputeDirectTask
:
public
ThreadPool
::
Task
{
class
CpuNonbondedForce
::
ComputeDirectTask
:
public
ThreadPool
::
Task
{
public:
public:
...
@@ -58,21 +58,22 @@ public:
...
@@ -58,21 +58,22 @@ public:
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
CpuNonbondedForce
::
CpuNonbondedForce
()
:
cutoff
(
false
),
useSwitch
(
false
),
periodic
(
false
),
ewald
(
false
),
pme
(
false
)
{
CpuNonbondedForce
::
CpuNonbondedForce
()
:
cutoff
(
false
),
useSwitch
(
false
),
periodic
(
false
),
ewald
(
false
),
pme
(
false
)
,
tableIsValid
(
false
)
{
}
}
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
Set the force to use a cutoff.
@param distance the cutoff distance
@param distance the cutoff distance
@param neighbors the neighbor list to use
@param neighbors the neighbor list to use
@param solventDielectric the dielectric constant of the bulk solvent
@param solventDielectric the dielectric constant of the bulk solvent
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
CpuNonbondedForce
::
setUseCutoff
(
float
distance
,
const
CpuNeighborList
&
neighbors
,
float
solventDielectric
)
{
void
CpuNonbondedForce
::
setUseCutoff
(
float
distance
,
const
CpuNeighborList
&
neighbors
,
float
solventDielectric
)
{
if
(
distance
!=
cutoffDistance
)
tableIsValid
=
false
;
cutoff
=
true
;
cutoff
=
true
;
cutoffDistance
=
distance
;
cutoffDistance
=
distance
;
neighborList
=
&
neighbors
;
neighborList
=
&
neighbors
;
...
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
...
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
CpuNonbondedForce
::
setUseEwald
(
float
alpha
,
int
kmaxx
,
int
kmaxy
,
int
kmaxz
)
{
void
CpuNonbondedForce
::
setUseEwald
(
float
alpha
,
int
kmaxx
,
int
kmaxy
,
int
kmaxz
)
{
if
(
alpha
!=
alphaEwald
)
tableIsValid
=
false
;
alphaEwald
=
alpha
;
alphaEwald
=
alpha
;
numRx
=
kmaxx
;
numRx
=
kmaxx
;
numRy
=
kmaxy
;
numRy
=
kmaxy
;
...
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
...
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
--------------------------------------------------------------------------------------- */
void
CpuNonbondedForce
::
setUsePME
(
float
alpha
,
int
meshSize
[
3
])
{
void
CpuNonbondedForce
::
setUsePME
(
float
alpha
,
int
meshSize
[
3
])
{
if
(
alpha
!=
alphaEwald
)
tableIsValid
=
false
;
alphaEwald
=
alpha
;
alphaEwald
=
alpha
;
meshDim
[
0
]
=
meshSize
[
0
];
meshDim
[
0
]
=
meshSize
[
0
];
meshDim
[
1
]
=
meshSize
[
1
];
meshDim
[
1
]
=
meshSize
[
1
];
...
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
...
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void
CpuNonbondedForce
::
tabulateEwaldScaleFactor
()
{
void
CpuNonbondedForce
::
tabulateEwaldScaleFactor
()
{
ewaldDX
=
cutoffDistance
/
(
NUM_TABLE_POINTS
-
2
);
if
(
tableIsValid
)
return
;
tableIsValid
=
true
;
ewaldDX
=
cutoffDistance
/
NUM_TABLE_POINTS
;
ewaldDXInv
=
1.0
f
/
ewaldDX
;
ewaldDXInv
=
1.0
f
/
ewaldDX
;
vector
<
double
>
x
(
NUM_TABLE_POINTS
+
1
);
ewaldScaleTable
.
resize
(
NUM_TABLE_POINTS
+
4
);
vector
<
double
>
y
(
NUM_TABLE_POINTS
+
1
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
4
;
i
++
)
{
vector
<
double
>
deriv
;
double
r
=
i
*
ewaldDX
;
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
+
1
;
i
++
)
{
double
r
=
i
*
cutoffDistance
/
(
NUM_TABLE_POINTS
-
2
);
double
alphaR
=
alphaEwald
*
r
;
double
alphaR
=
alphaEwald
*
r
;
x
[
i
]
=
r
;
ewaldScaleTable
[
i
]
=
erfc
(
alphaR
)
+
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
);
y
[
i
]
=
erfc
(
alphaR
)
+
TWO_OVER_SQRT_PI
*
alphaR
*
exp
(
-
alphaR
*
alphaR
);
}
SplineFitter
::
createNaturalSpline
(
x
,
y
,
deriv
);
ewaldScaleTable
.
resize
(
4
*
NUM_TABLE_POINTS
);
for
(
int
i
=
0
;
i
<
NUM_TABLE_POINTS
;
i
++
)
{
ewaldScaleTable
[
4
*
i
]
=
(
float
)
y
[
i
];
ewaldScaleTable
[
4
*
i
+
1
]
=
(
float
)
y
[
i
+
1
];
ewaldScaleTable
[
4
*
i
+
2
]
=
(
float
)
(
deriv
[
i
]
*
ewaldDX
*
ewaldDX
/
6
);
ewaldScaleTable
[
4
*
i
+
3
]
=
(
float
)
(
deriv
[
i
+
1
]
*
ewaldDX
*
ewaldDX
/
6
);
}
}
}
}
...
@@ -291,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
...
@@ -291,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
void
CpuNonbondedForce
::
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
vector
<
RealVec
>&
atomCoordinates
,
const
vector
<
pair
<
float
,
float
>
>&
atomParameters
,
void
CpuNonbondedForce
::
calculateDirectIxn
(
int
numberOfAtoms
,
float
*
posq
,
const
vector
<
RealVec
>&
atomCoordinates
,
const
vector
<
pair
<
float
,
float
>
>&
atomParameters
,
const
vector
<
set
<
int
>
>&
exclusions
,
vector
<
vector
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
const
vector
<
set
<
int
>
>&
exclusions
,
vector
<
AlignedArray
<
float
>
>&
threadForce
,
double
*
totalEnergy
,
ThreadPool
&
threads
)
{
// Record the parameters for the threads.
// Record the parameters for the threads.
this
->
numberOfAtoms
=
numberOfAtoms
;
this
->
numberOfAtoms
=
numberOfAtoms
;
...
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
...
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
this
->
threadForce
=
&
threadForce
;
this
->
threadForce
=
&
threadForce
;
includeEnergy
=
(
totalEnergy
!=
NULL
);
includeEnergy
=
(
totalEnergy
!=
NULL
);
threadEnergy
.
resize
(
threads
.
getNumThreads
());
threadEnergy
.
resize
(
threads
.
getNumThreads
());
gmx_atomic_t
counter
;
gmx_atomic_set
(
&
counter
,
0
);
this
->
atomicCounter
=
&
counter
;
// Signal the threads to start running and wait for them to finish.
// Signal the threads to start running and wait for them to finish.
...
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
...
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
if
(
ewald
||
pme
)
{
if
(
ewald
||
pme
)
{
// Compute the interactions from the neighbor list.
// Compute the interactions from the neighbor list.
for
(
int
i
=
threadIndex
;
i
<
neighborList
->
getNumBlocks
();
i
+=
numThreads
)
while
(
true
)
{
calculateBlockEwaldIxn
(
i
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
int
nextBlock
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
nextBlock
>=
neighborList
->
getNumBlocks
())
break
;
calculateBlockEwaldIxn
(
nextBlock
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
}
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
...
@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
...
@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
else
if
(
cutoff
)
{
else
if
(
cutoff
)
{
// Compute the interactions from the neighbor list.
// Compute the interactions from the neighbor list.
for
(
int
i
=
threadIndex
;
i
<
neighborList
->
getNumBlocks
();
i
+=
numThreads
)
while
(
true
)
{
calculateBlockIxn
(
i
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
int
nextBlock
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
nextBlock
>=
neighborList
->
getNumBlocks
())
break
;
calculateBlockIxn
(
nextBlock
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
}
}
}
else
{
else
{
// Loop over all atom pairs
// Loop over all atom pairs
for
(
int
i
=
threadIndex
;
i
<
numberOfAtoms
;
i
+=
numThreads
){
while
(
true
)
{
int
i
=
gmx_atomic_fetch_add
(
reinterpret_cast
<
gmx_atomic_t
*>
(
atomicCounter
),
1
);
if
(
i
>=
numberOfAtoms
)
break
;
for
(
int
j
=
i
+
1
;
j
<
numberOfAtoms
;
j
++
)
for
(
int
j
=
i
+
1
;
j
<
numberOfAtoms
;
j
++
)
if
(
exclusions
[
j
].
find
(
i
)
==
exclusions
[
j
].
end
())
if
(
exclusions
[
j
].
find
(
i
)
==
exclusions
[
j
].
end
())
calculateOneIxn
(
i
,
j
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
calculateOneIxn
(
i
,
j
,
forces
,
energyPtr
,
boxSize
,
invBoxSize
);
...
@@ -461,12 +472,12 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
...
@@ -461,12 +472,12 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
break
;
break
;
}
}
}
}
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
bool
include
[
4
];
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
// Load the next neighbor.
...
@@ -475,43 +486,50 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
...
@@ -475,43 +486,50 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
bool
any
=
false
;
fvec4
dx
,
dy
,
dz
,
r2
;
fvec4
dx
,
dy
,
dz
,
r2
;
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
ivec4
include
;
include
[
j
]
=
(((
exclusions
[
i
]
>>
j
)
&
1
)
==
0
&&
(
!
cutoff
||
r2
[
j
]
<
cutoffDistance
*
cutoffDistance
));
char
excl
=
exclusions
[
i
];
any
|=
include
[
j
];
if
(
excl
==
0
)
}
include
=
-
1
;
if
(
!
any
)
else
include
=
ivec4
(
excl
&
1
?
0
:
-
1
,
excl
&
2
?
0
:
-
1
,
excl
&
4
?
0
:
-
1
,
excl
&
8
?
0
:
-
1
);
include
=
include
&
(
r2
<
cutoffDistance
*
cutoffDistance
);
if
(
!
any
(
include
))
continue
;
// No interactions to compute.
continue
;
// No interactions to compute.
// Compute the interactions.
// Compute the interactions.
fvec4
r
=
sqrt
(
r2
);
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
switchValue
(
1.0
f
),
switchDeriv
(
0.0
f
);
fvec4
energy
,
dEdR
;
if
(
useSwitch
)
{
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
/
(
cutoffDistance
-
switchingDistance
));
if
(
atomEpsilon
!=
0.0
f
)
{
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
/
(
cutoffDistance
-
switchingDistance
);
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
epsSig6
=
blockAtomEpsilon
*
atomEpsilon
*
sig6
;
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
if
(
useSwitch
)
{
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
dEdR
=
switchValue
*
dEdR
-
energy
*
switchDeriv
*
r
;
energy
*=
switchValue
;
}
}
else
{
energy
=
0.0
f
;
dEdR
=
0.0
f
;
}
}
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
eps
=
blockAtomEpsilon
*
atomParameters
[
atom
].
second
;
fvec4
dEdR
=
switchValue
*
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
if
(
cutoff
)
if
(
cutoff
)
dEdR
+=
chargeProd
*
(
inverseR
-
2.0
f
*
krf
*
r2
);
dEdR
+=
chargeProd
*
(
inverseR
-
2.0
f
*
krf
*
r2
);
else
else
dEdR
+=
chargeProd
*
inverseR
;
dEdR
+=
chargeProd
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
dEdR
*=
inverseR
*
inverseR
;
fvec4
energy
=
eps
*
(
sig6
-
1.0
f
)
*
sig6
;
if
(
useSwitch
)
{
dEdR
-=
energy
*
switchDeriv
*
inverseR
;
energy
*=
switchValue
;
}
// Accumulate energies.
// Accumulate energies.
...
@@ -520,27 +538,25 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
...
@@ -520,27 +538,25 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
energy
+=
chargeProd
*
(
inverseR
+
krf
*
r2
-
crf
);
energy
+=
chargeProd
*
(
inverseR
+
krf
*
r2
-
crf
);
else
else
energy
+=
chargeProd
*
inverseR
;
energy
+=
chargeProd
*
inverseR
;
for
(
int
j
=
0
;
j
<
4
;
j
++
)
energy
=
blend
(
0.0
f
,
energy
,
include
);
if
(
include
[
j
])
*
totalEnergy
+=
dot4
(
energy
,
1.0
f
);
*
totalEnergy
+=
energy
[
j
];
}
}
// Accumulate forces.
// Accumulate forces.
dEdR
=
blend
(
0.0
f
,
dEdR
,
include
);
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
fvec4
atomForce
(
forces
+
4
*
atom
);
fvec4
atomForce
(
forces
+
4
*
atom
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
blockAtomForce
[
j
]
+=
result
[
j
];
blockAtomForce
[
j
]
+=
result
[
j
];
atomForce
-=
result
[
j
];
atomForce
-=
result
[
j
];
}
}
}
atomForce
.
store
(
forces
+
4
*
atom
);
atomForce
.
store
(
forces
+
4
*
atom
);
}
}
// Record the forces on the block atoms.
// Record the forces on the block atoms.
for
(
int
j
=
0
;
j
<
4
;
j
++
)
for
(
int
j
=
0
;
j
<
4
;
j
++
)
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
blockAtomForce
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
blockAtomForce
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
}
}
...
@@ -569,12 +585,12 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
...
@@ -569,12 +585,12 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
needPeriodic
=
true
;
needPeriodic
=
true
;
break
;
break
;
}
}
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
int
>&
neighbors
=
neighborList
->
getBlockNeighbors
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
const
vector
<
char
>&
exclusions
=
neighborList
->
getBlockExclusions
(
blockIndex
);
bool
include
[
4
];
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
neighbors
.
size
();
i
++
)
{
// Load the next neighbor.
// Load the next neighbor.
...
@@ -583,60 +599,65 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
...
@@ -583,60 +599,65 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
bool
any
=
false
;
fvec4
dx
,
dy
,
dz
,
r2
;
fvec4
dx
,
dy
,
dz
,
r2
;
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
getDeltaR
(
atomPosq
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
ivec4
include
;
include
[
j
]
=
(((
exclusions
[
i
]
>>
j
)
&
1
)
==
0
&&
r2
[
j
]
<
cutoffDistance
*
cutoffDistance
);
char
excl
=
exclusions
[
i
];
any
|=
include
[
j
];
if
(
excl
==
0
)
}
include
=
-
1
;
if
(
!
any
)
else
include
=
ivec4
(
excl
&
1
?
0
:
-
1
,
excl
&
2
?
0
:
-
1
,
excl
&
4
?
0
:
-
1
,
excl
&
8
?
0
:
-
1
);
include
=
include
&
(
r2
<
cutoffDistance
*
cutoffDistance
);
if
(
!
any
(
include
))
continue
;
// No interactions to compute.
continue
;
// No interactions to compute.
// Compute the interactions.
// Compute the interactions.
fvec4
r
=
sqrt
(
r2
);
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
switchValue
(
1.0
f
),
switchDeriv
(
0.0
f
);
fvec4
energy
,
dEdR
;
if
(
useSwitch
)
{
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
/
(
cutoffDistance
-
switchingDistance
));
if
(
atomEpsilon
!=
0.0
f
)
{
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
/
(
cutoffDistance
-
switchingDistance
);
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
epsSig6
=
blockAtomEpsilon
*
atomEpsilon
*
sig6
;
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
if
(
useSwitch
)
{
fvec4
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
dEdR
=
switchValue
*
dEdR
-
energy
*
switchDeriv
*
r
;
energy
*=
switchValue
;
}
}
}
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
else
{
fvec4
dEdR
=
chargeProd
*
inverseR
*
ewaldScaleFunction
(
r
);
energy
=
0.0
f
;
fvec4
sig
=
blockAtomSigma
+
atomParameters
[
atom
].
first
;
dEdR
=
0.0
f
;
fvec4
sig2
=
inverseR
*
sig
;
sig2
*=
sig2
;
fvec4
sig6
=
sig2
*
sig2
*
sig2
;
fvec4
eps
=
blockAtomEpsilon
*
atomParameters
[
atom
].
second
;
dEdR
+=
switchValue
*
eps
*
(
12.0
f
*
sig6
-
6.0
f
)
*
sig6
;
dEdR
*=
inverseR
*
inverseR
;
fvec4
energy
=
eps
*
(
sig6
-
1.0
f
)
*
sig6
;
if
(
useSwitch
)
{
dEdR
-=
energy
*
switchDeriv
*
inverseR
;
energy
*=
switchValue
;
}
}
fvec4
chargeProd
=
blockAtomCharge
*
posq
[
4
*
atom
+
3
];
dEdR
+=
chargeProd
*
inverseR
*
ewaldScaleFunction
(
r
);
dEdR
*=
inverseR
*
inverseR
;
// Accumulate energies.
// Accumulate energies.
if
(
totalEnergy
)
{
if
(
totalEnergy
)
{
energy
+=
chargeProd
*
inverseR
*
erfcApprox
(
alphaEwald
*
r
);
energy
+=
chargeProd
*
inverseR
*
erfcApprox
(
alphaEwald
*
r
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
energy
=
blend
(
0.0
f
,
energy
,
include
);
if
(
include
[
j
])
*
totalEnergy
+=
dot4
(
energy
,
1.0
f
);
*
totalEnergy
+=
energy
[
j
];
}
}
// Accumulate forces.
// Accumulate forces.
dEdR
=
blend
(
0.0
f
,
dEdR
,
include
);
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
fvec4
result
[
4
]
=
{
dx
*
dEdR
,
dy
*
dEdR
,
dz
*
dEdR
,
0.0
f
};
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
transpose
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
fvec4
atomForce
(
forces
+
4
*
atom
);
fvec4
atomForce
(
forces
+
4
*
atom
);
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
if
(
include
[
j
])
{
blockAtomForce
[
j
]
+=
result
[
j
];
blockAtomForce
[
j
]
+=
result
[
j
];
atomForce
-=
result
[
j
];
atomForce
-=
result
[
j
];
}
}
}
atomForce
.
store
(
forces
+
4
*
atom
);
atomForce
.
store
(
forces
+
4
*
atom
);
}
}
...
@@ -683,18 +704,14 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
...
@@ -683,18 +704,14 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
fvec4
CpuNonbondedForce
::
ewaldScaleFunction
(
fvec4
x
)
{
fvec4
CpuNonbondedForce
::
ewaldScaleFunction
(
fvec4
x
)
{
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
float
y
[
4
];
fvec4
x1
=
x
*
ewaldDXInv
;
fvec4
x1
=
x
*
ewaldDXInv
;
ivec4
index
=
floor
(
x1
);
ivec4
index
=
min
(
floor
(
x1
),
NUM_TABLE_POINTS
);
fvec4
coeff
[
4
];
fvec4
coeff2
=
x1
-
index
;
coeff
[
1
]
=
x1
-
index
;
fvec4
coeff1
=
1.0
f
-
coeff2
;
coeff
[
0
]
=
1.0
f
-
coeff
[
1
];
fvec4
t1
(
&
ewaldScaleTable
[
index
[
0
]]);
coeff
[
2
]
=
coeff
[
0
]
*
coeff
[
0
]
*
coeff
[
0
]
-
coeff
[
0
];
fvec4
t2
(
&
ewaldScaleTable
[
index
[
1
]]);
coeff
[
3
]
=
coeff
[
1
]
*
coeff
[
1
]
*
coeff
[
1
]
-
coeff
[
1
];
fvec4
t3
(
&
ewaldScaleTable
[
index
[
2
]]);
transpose
(
coeff
[
0
],
coeff
[
1
],
coeff
[
2
],
coeff
[
3
]);
fvec4
t4
(
&
ewaldScaleTable
[
index
[
3
]]);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
transpose
(
t1
,
t2
,
t3
,
t4
);
if
(
index
[
i
]
<
NUM_TABLE_POINTS
)
return
coeff1
*
t1
+
coeff2
*
t2
;
y
[
i
]
=
dot4
(
coeff
[
i
],
fvec4
(
&
ewaldScaleTable
[
4
*
index
[
i
]]));
}
return
fvec4
(
y
);
}
}
platforms/cpu/src/CpuPlatform.cpp
View file @
8e656070
...
@@ -32,6 +32,8 @@
...
@@ -32,6 +32,8 @@
#include "CpuPlatform.h"
#include "CpuPlatform.h"
#include "CpuKernelFactory.h"
#include "CpuKernelFactory.h"
#include "CpuKernels.h"
#include "CpuKernels.h"
#include "CpuSETTLE.h"
#include "ReferenceConstraints.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/hardware.h"
using
namespace
OpenMM
;
using
namespace
OpenMM
;
...
@@ -77,6 +79,12 @@ void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>
...
@@ -77,6 +79,12 @@ void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>
ReferencePlatform
::
contextCreated
(
context
,
properties
);
ReferencePlatform
::
contextCreated
(
context
,
properties
);
PlatformData
*
data
=
new
PlatformData
(
context
.
getSystem
().
getNumParticles
());
PlatformData
*
data
=
new
PlatformData
(
context
.
getSystem
().
getNumParticles
());
contextData
[
&
context
]
=
data
;
contextData
[
&
context
]
=
data
;
ReferenceConstraints
&
constraints
=
*
(
ReferenceConstraints
*
)
reinterpret_cast
<
ReferencePlatform
::
PlatformData
*>
(
context
.
getPlatformData
())
->
constraints
;
if
(
constraints
.
settle
!=
NULL
)
{
CpuSETTLE
*
parallelSettle
=
new
CpuSETTLE
(
context
.
getSystem
(),
*
(
ReferenceSETTLEAlgorithm
*
)
constraints
.
settle
,
data
->
threads
);
delete
constraints
.
settle
;
constraints
.
settle
=
parallelSettle
;
}
}
}
void
CpuPlatform
::
contextDestroyed
(
ContextImpl
&
context
)
const
{
void
CpuPlatform
::
contextDestroyed
(
ContextImpl
&
context
)
const
{
...
@@ -89,8 +97,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
...
@@ -89,8 +97,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
return
*
contextData
[
&
context
];
return
*
contextData
[
&
context
];
}
}
CpuPlatform
::
PlatformData
::
PlatformData
(
int
numParticles
)
{
CpuPlatform
::
PlatformData
::
PlatformData
(
int
numParticles
)
:
posq
(
4
*
numParticles
)
{
posq
.
resize
(
4
*
numParticles
);
int
numThreads
=
threads
.
getNumThreads
();
int
numThreads
=
threads
.
getNumThreads
();
threadForce
.
resize
(
numThreads
);
threadForce
.
resize
(
numThreads
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
...
...
platforms/cpu/src/CpuSETTLE.cpp
0 → 100644
View file @
8e656070
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuSETTLE.h"
using
namespace
OpenMM
;
using
namespace
std
;
class
CpuSETTLE
::
ApplyToPositionsTask
:
public
ThreadPool
::
Task
{
public:
ApplyToPositionsTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
atomCoordinatesP
(
atomCoordinatesP
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
threadSettle
[
threadIndex
]
->
apply
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
);
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
;
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
};
class
CpuSETTLE
::
ApplyToVelocitiesTask
:
public
ThreadPool
::
Task
{
public:
ApplyToVelocitiesTask
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
velocities
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
,
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
)
:
atomCoordinates
(
atomCoordinates
),
velocities
(
velocities
),
inverseMasses
(
inverseMasses
),
tolerance
(
tolerance
),
threadSettle
(
threadSettle
)
{
}
void
execute
(
ThreadPool
&
threads
,
int
threadIndex
)
{
threadSettle
[
threadIndex
]
->
applyToVelocities
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
);
}
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
;
vector
<
OpenMM
::
RealVec
>&
velocities
;
vector
<
RealOpenMM
>&
inverseMasses
;
RealOpenMM
tolerance
;
vector
<
ReferenceSETTLEAlgorithm
*>&
threadSettle
;
};
CpuSETTLE
::
CpuSETTLE
(
const
System
&
system
,
const
ReferenceSETTLEAlgorithm
&
settle
,
ThreadPool
&
threads
)
:
threads
(
threads
)
{
int
numThreads
=
threads
.
getNumThreads
();
int
numClusters
=
settle
.
getNumClusters
();
vector
<
RealOpenMM
>
mass
(
system
.
getNumParticles
());
for
(
int
i
=
0
;
i
<
system
.
getNumParticles
();
i
++
)
mass
[
i
]
=
system
.
getParticleMass
(
i
);
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
{
int
start
=
i
*
numClusters
/
numThreads
;
int
end
=
(
i
+
1
)
*
numClusters
/
numThreads
;
if
(
start
!=
end
)
{
int
numThreadClusters
=
end
-
start
;
vector
<
int
>
atom1
(
numThreadClusters
),
atom2
(
numThreadClusters
),
atom3
(
numThreadClusters
);
vector
<
RealOpenMM
>
distance1
(
numThreadClusters
),
distance2
(
numThreadClusters
);
for
(
int
j
=
0
;
j
<
numThreadClusters
;
j
++
)
settle
.
getClusterParameters
(
start
+
j
,
atom1
[
j
],
atom2
[
j
],
atom3
[
j
],
distance1
[
j
],
distance2
[
j
]);
threadSettle
.
push_back
(
new
ReferenceSETTLEAlgorithm
(
atom1
,
atom2
,
atom3
,
distance1
,
distance2
,
mass
));
}
}
}
CpuSETTLE
::~
CpuSETTLE
()
{
for
(
int
i
=
0
;
i
<
(
int
)
threadSettle
.
size
();
i
++
)
delete
threadSettle
[
i
];
}
void
CpuSETTLE
::
apply
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
atomCoordinatesP
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
)
{
ApplyToPositionsTask
task
(
atomCoordinates
,
atomCoordinatesP
,
inverseMasses
,
tolerance
,
threadSettle
);
threads
.
execute
(
task
);
threads
.
waitForThreads
();
}
void
CpuSETTLE
::
applyToVelocities
(
vector
<
OpenMM
::
RealVec
>&
atomCoordinates
,
vector
<
OpenMM
::
RealVec
>&
velocities
,
vector
<
RealOpenMM
>&
inverseMasses
,
RealOpenMM
tolerance
)
{
ApplyToVelocitiesTask
task
(
atomCoordinates
,
velocities
,
inverseMasses
,
tolerance
,
threadSettle
);
threads
.
execute
(
task
);
threads
.
waitForThreads
();
}
platforms/cpu/src/gmx_atomic.h
0 → 100644
View file @
8e656070
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
*
* Copyright (c) 2004-2008, Erik Lindahl <lindahl@cbr.su.se>
*
* Unfortunately, some of the constructs in this file are _very_ sensitive
* to compiler optimizations and architecture changes. If you find any such
* errors, please send a message to lindahl@cbr.su.se to help us fix the
* upstream version too.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* And Hey:
* Gnomes, ROck Monsters And Chili Sauce
*/
#ifndef _GMX_ATOMIC_H_
#define _GMX_ATOMIC_H_
/*! \file gmx_atomic.h
*
* @brief Atomic operations for fast SMP synchronization
*
* This file defines atomic integer operations and spinlocks for
* fast synchronization in performance-critical regions of gromacs.
*
* In general, the best option is to use functions without explicit
* locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
*
* Not all architecture support atomic operations though inline assembly,
* and even if they do it might not be implemented here. In that case
* we use a fallback mutex implementation, so you can always count on
* the function interfaces working in Gromacs.
*
* Don't use spinlocks in non-performance-critical regions like file I/O.
* Since they always spin busy they would waste CPU cycles instead of
* properly yielding to a computation thread while waiting for the disk.
*
* Finally, note that all our spinlock operations are defined to return
* 0 if initialization or locking completes successfully.
* This is the opposite of some other implementations, but the same standard
* as used for pthread mutexes. So, if e.g. are trying to lock a spinlock,
* you will have gotten the lock if the return value is 0.
*
* gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
* and 0 if it is available, though...
*/
#include <stdio.h>
#include <pthread.h>
#ifdef __cplusplus
extern
"C"
{
#endif
#if 0
} /* Avoids screwing up auto-indentation */
#endif
#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
(defined(i386) || defined(__x86_64__)) ) \
|| defined (DOXYGEN) )
/* This code is executed for x86 and x86-64, with these compilers:
* GNU
* Intel
* Pathscale
* All these support GCC-style inline assembly.
* We also use this section for the documentation.
*/
/*! \brief Memory barrier operation
*
* Modern CPUs rely heavily on out-of-order execution, and one common feature
* is that load/stores might be reordered. Also, when using inline assembly
* the compiler might already have loaded the variable we are changing into
* a register, so any update to memory won't be visible.
*
* This command creates a memory barrier, i.e. all memory results before
* it in the code should be visible to all memory operations after it - the
* CPU cannot propagate load/stores across it.
*/
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
#define __builtin_constant_p(i) (1)
#endif
/*! \brief Gromacs atomic operations datatype
*
* Portable synchronization primitives like mutexes are effective for
* many purposes, but usually not very high performance.
* One of the problem is that you have the overhead of a function call,
* and another is that Mutexes often have extra overhead to make the
* scheduling fair. Finally, if performance is important we don't want
* to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
* CPU usage until the resources is available (e.g. increment a counter).
*
* These things can often be implemented with inline-assembly or other
* system-dependent functions, and we provide such functionality for the
* most common platforms. For portability we also have a fallback
* implementation using a mutex for locking.
*
* Performance-wise, the fastest solution is always to avoid locking
* completely (obvious, but remember it!). If you cannot do that, the
* next best thing is to use atomic operations that e.g. increment a
* counter without explicit locking. Spinlocks are useful to lock an
* entire region, but leads to more overhead and can be difficult to
* debug - it is up to you to make sure that only the thread owning the
* lock unlocks it!
*
* You should normally NOT use atomic operations for things like
* I/O threads. These should yield to other threads while waiting for
* the disk instead of spinning at 100% CPU usage.
*
* It is imperative that you use the provided routines for reading
* and writing, since some implementations require memory barriers before
* the CPU or memory sees an updated result. The structure contents is
* only visible here so it can be inlined for performance - it might
* change without further notice.
*
* \note No initialization is required for atomic variables.
*
* Currently, we have (real) atomic operations for:
*
* - x86 or x86_64, using GNU compilers
* - x86 or x86_64, using Intel compilers
* - x86 or x86_64, using Pathscale compilers
* - Itanium, using GNU compilers
* - Itanium, using Intel compilers
* - Itanium, using HP compilers
* - PowerPC, using GNU compilers
* - PowerPC, using IBM AIX compilers
* - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
*/
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
/*! \brief Gromacs spinlock
*
* Spinlocks provide a faster synchronization than mutexes,
* although they consume CPU-cycles while waiting. They are implemented
* with atomic operations and inline assembly whenever possible, and
* otherwise we use a fallback implementation where a spinlock is identical
* to a mutex (this is one of the reasons why you have to initialize them).
*
* There are no guarantees whatsoever about fair scheduling or
* debugging if you make a mistake and unlock a variable somebody
* else has locked - performance is the primary goal of spinlocks.
*
*/
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
/*! \brief Spinlock static initializer
*
* This is used for static spinlock initialization, and has the same
* properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
* This is only for inlining in the gmx_thread.h header file. Whether
* it is 0, 1, or something else when unlocked depends on the platform.
* Don't assume anything about it. It might even be a mutex when using the
* fallback implementation!
*/
#define GMX_SPINLOCK_INITIALIZER { 1 }
/*! \brief Return value of an atomic integer
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a Atomic variable to read
* \return Integer value of the atomic variable
*/
#define gmx_atomic_read(a) ((a)->value)
/*! \brief Write value to an atomic integer
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a Atomic variable
* \param i Integer to set the atomic variable to.
*/
#define gmx_atomic_set(a,i) (((a)->value) = (i))
/*! \brief Add integer to atomic variable
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a atomic datatype to modify
* \param i integer to increment with. Use i<0 to subtract atomically.
*
* \return The new value (after summation).
*/
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
int
__i
;
__i
=
i
;
__asm__
__volatile__
(
"lock ; xaddl %0, %1;"
:
"=r"
(
i
)
:
"m"
(
a
->
value
),
"0"
(
i
));
return
i
+
__i
;
}
/*! \brief Add to variable, return the old value.
*
* This operation is quite useful for synchronization counters.
* By performing a fetchadd with N, a thread can e.g. reserve a chunk
* with the next N iterations, and the return value is the index
* of the first element to treat.
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a atomic datatype to modify
* \param i integer to increment with. Use i<0 to subtract atomically.
*
* \return The value of the atomic variable before addition.
*/
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
int
__i
;
__i
=
i
;
__asm__
__volatile__
(
"lock ; xaddl %0, %1;"
:
"=r"
(
i
)
:
"m"
(
a
->
value
),
"0"
(
i
));
return
i
;
}
/*! \brief Atomic compare-exchange operation
*
* The \a old value is compared with the memory value in the atomic datatype.
* If the are identical, the atomic type is updated to the new value,
* and otherwise left unchanged.
*
* This is a very useful synchronization primitive: You can start by reading
* a value (without locking anything), perform some calculations, and then
* atomically try to update it in memory unless it has changed. If it has
* changed you will get an error return code - reread the new value
* an repeat the calculations in that case.
*
* \param a Atomic datatype ('memory' value)
* \param oldval Integer value read from the atomic type at an earlier point
* \param newval New value to write to the atomic type if it currently is
* identical to the old value.
*
* \return The value of the atomic memory variable in memory when this
* instruction was executed. This, if the operation succeeded the
* return value was identical to the \a old parameter, and if not
* it returns the updated value in memory so you can repeat your
* operations on it.
*
* \note The exchange occured if the return value is identical to \a old.
*/
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
volatile
unsigned
long
prev
;
__asm__
__volatile__
(
"lock ; cmpxchgl %1,%2"
:
"=a"
(
prev
)
:
"q"
(
newval
),
"m"
(
a
->
value
),
"0"
(
oldval
)
:
"memory"
);
return
prev
;
}
/*! \brief Initialize spinlock
*
* In theory you can call this from multiple threads, but remember
* that we don't check for errors. If the first thread proceeded to
* lock the spinlock after initialization, the second will happily
* overwrite the contents and unlock it without warning you.
*
* \param x Gromacs spinlock pointer.
*/
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
1
;
}
/*! \brief Acquire spinlock
*
* This routine blocks until the spinlock is available, and
* the locks it again before returning.
*
* \param x Gromacs spinlock pointer
*/
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
__asm__
__volatile__
(
"
\n
1:
\t
"
"lock ; decb %0
\n\t
"
"jns 3f
\n
"
"2:
\t
"
"rep;nop
\n\t
"
"cmpb $0,%0
\n\t
"
"jle 2b
\n\t
"
"jmp 1b
\n
"
"3:
\n\t
"
:
"=m"
(
x
->
lock
)
:
:
"memory"
);
}
/*! \brief Attempt to acquire spinlock
*
* This routine acquires the spinlock if possible, but if
* already locked it return an error code immediately.
*
* \param x Gromacs spinlock pointer
*
* \return 0 if the mutex was available so we could lock it,
* otherwise a non-zero integer (1) if the lock is busy.
*/
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
char
old_value
;
__asm__
__volatile__
(
"xchgb %b0,%1"
:
"=q"
(
old_value
),
"=m"
(
x
->
lock
)
:
"0"
(
0
)
:
"memory"
);
return
(
old_value
<=
0
);
}
/*! \brief Release spinlock
*
* \param x Gromacs spinlock pointer
*
* Unlocks the spinlock, regardless if which thread locked it.
*/
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
char
old_value
=
1
;
__asm__
__volatile__
(
"xchgb %b0, %1"
:
"=q"
(
old_value
),
"=m"
(
x
->
lock
)
:
"0"
(
old_value
)
:
"memory"
);
}
/*! \brief Check if spinlock is locked
*
* This routine returns immediately with the lock status.
*
* \param x Gromacs spinlock pointer
*
* \return 1 if the spinlock is locked, 0 otherwise.
*/
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
*
(
volatile
signed
char
*
)(
&
(
x
)
->
lock
)
<=
0
);
}
/*! \brief Wait for a spinlock to become available
*
* This routine blocks until the spinlock is unlocked,
* but in contrast to gmx_spinlock_lock() it returns without
* trying to lock the spinlock.
*
* \param x Gromacs spinlock pointer
*/
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))
/* PowerPC using proper GCC inline assembly.
* Recent versions of xlC (>=7.0) _partially_ support this, but since it is
* not 100% compatible we provide a separate implementation for xlC in
* the next section.
*/
/* Compiler-dependent stuff: GCC memory barrier */
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
)
:
"cc"
,
"memory"
);
return
t
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
)
:
"cc"
,
"memory"
);
return
(
t
-
i
);
}
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
prev
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
cmpw 0,%0,%3
\n
"
"
\t
bne 2f
\n
"
"
\t
stwcx. %4,0,%2
\n
"
"bne- 1b
\n
"
"
\t
sync
\n
"
"2:
\n
"
:
"=&r"
(
prev
),
"=m"
(
a
->
value
)
:
"r"
(
&
a
->
value
),
"r"
(
oldval
),
"r"
(
newval
),
"m"
(
a
->
value
)
:
"cc"
,
"memory"
);
return
prev
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
tmp
;
__asm__
__volatile__
(
"
\t
b 1f
\n
"
"2: lwzx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne+ 2b
\n
"
"1: lwarx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne- 2b
\n
"
"
\t
stwcx. %2,0,%1
\n
"
"bne- 2b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
tmp
)
:
"r"
(
&
x
->
lock
),
"r"
(
1
)
:
"cr0"
,
"memory"
);
}
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
old
,
t
;
unsigned
int
mask
=
1
;
volatile
unsigned
int
*
p
=
&
x
->
lock
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%4
\n
"
"
\t
or %1,%0,%3
\n
"
"
\t
stwcx. %1,0,%4
\n
"
"
\t
bne 1b
\n
"
"
\t
sync
\n
"
:
"=&r"
(
old
),
"=&r"
(
t
),
"=m"
(
*
p
)
:
"r"
(
mask
),
"r"
(
p
),
"m"
(
*
p
)
:
"cc"
,
"memory"
);
return
((
old
&
mask
)
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
__asm__
__volatile__
(
"
\t
eieio
\n
"
:
:
:
"memory"
);
x
->
lock
=
0
;
}
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM)) && \
(defined(__powerpc__) || defined(__ppc__)))
/* PowerPC using xlC inline assembly.
* Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
* if you use the option -qasm=gcc but we have had to hack things a bit, in
* particular when it comes to clobbered variables. Since this implementation
* _could_ be buggy, we have separated it from the known-to-be-working gcc
* one above.
*/
/* memory barrier - no idea how to create one with xlc! */
#define gmx_atomic_memory_barrier()
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
)
);
return
t
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%2
\n
"
"
\t
add %0,%1,%0
\n
"
"
\t
stwcx. %0,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
t
)
:
"r"
(
i
),
"r"
(
&
a
->
value
));
return
(
t
-
i
);
}
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
prev
;
__asm__
__volatile__
(
"1: lwarx %0,0,%2
\n
"
"
\t
cmpw 0,%0,%3
\n
"
"
\t
bne 2f
\n
"
"
\t
stwcx. %4,0,%2
\n
"
"
\t
bne- 1b
\n
"
"
\t
sync
\n
"
"2:
\n
"
:
"=&r"
(
prev
),
"=m"
(
a
->
value
)
:
"r"
(
&
a
->
value
),
"r"
(
oldval
),
"r"
(
newval
),
"m"
(
a
->
value
));
return
prev
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
tmp
;
__asm__
__volatile__
(
"
\t
b 1f
\n
"
"2: lwzx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne+ 2b
\n
"
"1: lwarx %0,0,%1
\n
"
"
\t
cmpwi 0,%0,0
\n
"
"
\t
bne- 2b
\n
"
"
\t
stwcx. %2,0,%1
\n
"
"
\t
bne- 2b
\n
"
"
\t
isync
\n
"
:
"=&r"
(
tmp
)
:
"r"
(
&
x
->
lock
),
"r"
(
1
));
}
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
unsigned
int
old
,
t
;
unsigned
int
mask
=
1
;
volatile
unsigned
int
*
p
=
&
x
->
lock
;
__asm__
__volatile__
(
"
\t
eieio
\n
"
"1: lwarx %0,0,%4
\n
"
"
\t
or %1,%0,%3
\n
"
"
\t
stwcx. %1,0,%4
\n
"
"
\t
bne 1b
\n
"
"
\t
sync
\n
"
:
"=&r"
(
old
),
"=&r"
(
t
),
"=m"
(
*
p
)
:
"r"
(
mask
),
"r"
(
p
),
"m"
(
*
p
));
return
((
old
&
mask
)
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
__asm__
__volatile__
(
"
\t
eieio
\n
"
);
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))
/* ia64 with GCC or Intel compilers. Since we need to define everything through
* cmpxchg and fetchadd on ia64, we merge the different compilers and only provide
* different implementations for that single function.
* Documentation? Check the gcc/x86 section.
*/
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
/* Compiler thingies */
#ifdef __INTEL_COMPILER
void
__memory_barrier
(
void
);
int
_InterlockedCompareExchange
(
volatile
int
*
dest
,
int
xchg
,
int
comp
);
unsigned
__int64
__fetchadd4_rel
(
unsigned
int
*
addend
,
const
int
increment
);
/* ia64 memory barrier */
# define gmx_atomic_memory_barrier() __memory_barrier()
/* ia64 cmpxchg */
# define gmx_atomic_cmpxchg(a, oldval, newval) _InterlockedCompareExchange(&a->value,newval,oldval)
/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
# define gmx_ia64_fetchadd(a, inc) __fetchadd4_rel(a, inc)
#elif defined __GNUC__
/* ia64 memory barrier */
# define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
/* ia64 cmpxchg */
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
volatile
int
res
;
asm
volatile
(
"mov ar.ccv=%0;;"
::
"rO"
(
oldval
));
asm
volatile
(
"cmpxchg4.acq %0=[%1],%2,ar.ccv"
:
"=r"
(
res
)
:
"r"
(
&
a
->
value
),
"r"
(
newval
)
:
"memory"
);
return
res
;
}
/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16 */
#define gmx_ia64_fetchadd(a, inc) \
({ unsigned long res; \
asm volatile ("fetchadd4.rel %0=[%1],%2" \
: "=r"(res) : "r"(a), "i" (inc) : "memory"); \
res; \
})
#else
/* Unknown compiler */
# error Unknown ia64 compiler (not GCC or ICC) - modify gmx_thread.h!
#endif
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
volatile
int
oldval
,
newval
;
volatile
int
__i
=
i
;
/* Use fetchadd if, and only if, the increment value can be determined
* at compile time (otherwise this check is optimized away) and it is
* a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
*/
if
(
__builtin_constant_p
(
i
)
&&
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
)
{
oldval
=
gmx_ia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
newval
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
volatile
int
i
)
{
volatile
int
oldval
,
newval
;
volatile
int
__i
=
i
;
/* Use ia64 fetchadd if, and only if, the increment value can be determined
* at compile time (otherwise this check is optimized away) and it is
* a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
*/
if
(
__builtin_constant_p
(
i
)
&&
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
)
{
oldval
=
gmx_ia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
oldval
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
gmx_atomic_t
*
a
=
(
gmx_atomic_t
*
)
x
;
unsigned
long
value
;
value
=
gmx_atomic_cmpxchg
(
a
,
0
,
1
);
if
(
value
)
{
do
{
while
(
a
->
value
!=
0
)
{
gmx_atomic_memory_barrier
();
}
value
=
gmx_atomic_cmpxchg
(
a
,
0
,
1
);
}
while
(
value
);
}
}
static
inline
int
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
return
(
gmx_atomic_cmpxchg
((
gmx_atomic_t
*
)
x
,
0
,
1
)
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
x
->
lock
=
0
;
}
while
(
0
);
}
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#undef gmx_ia64_fetchadd
#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
/* HP compiler on ia64 */
#include <machine/sys/inline.h>
#define gmx_atomic_memory_barrier() _Asm_mf()
#define gmx_hpia64_fetchadd(a, i) \
_Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL, \
(UInt32*)a,(unsigned int) i, \
(_Asm_ldhint)LDHINT_NONE)
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
ret
;
_Asm_mov_to_ar
((
_Asm_app_reg
)
_AREG_CCV
,(
Uint32
)
oldval
,
(
_Asm_fence
)(
_UP_CALL_FENCE
|
_UP_SYS_FENCE
|
_DOWN_CALL_FENCE
|
_DOWN_SYS_FENCE
));
ret
=
_Asm_cmpxchg
((
_Asm_sz
)
SZ_W
,(
_Asm_sem
)
_SEM_ACQ
,(
Uint32
*
)
a
,
(
Uint32
)
newval
,(
_Asm_ldhint
)
_LDHINT_NONE
);
return
ret
;
}
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static
inline
void
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
old
,
new
;
int
__i
=
i
;
/* On HP-UX we don't know any macro to determine whether the increment
* is known at compile time, but hopefully the call uses something simple
* like a constant, and then the optimizer should be able to do the job.
*/
if
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
{
oldval
=
gmx_hpia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
newval
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
oldval
,
newval
;
int
__i
=
i
;
/* On HP-UX we don't know any macro to determine whether the increment
* is known at compile time, but hopefully the call uses something simple
* like a constant, and then the optimizer should be able to do the job.
*/
if
(
(
__i
==
1
)
||
(
__i
==
4
)
||
(
__i
==
8
)
||
(
__i
==
16
)
||
(
__i
==
-
1
)
||
(
__i
==
-
4
)
||
(
__i
==
-
8
)
||
(
__i
==
-
16
)
)
{
oldval
=
gmx_hpia64_fetchadd
(
a
,
__i
);
newval
=
oldval
+
i
;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
gmx_atomic_cmpxchg
(
a
,
oldval
,
newval
)
!=
oldval
);
}
return
oldval
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
void
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
int
rc
;
rc
=
_Asm_xchg
((
_Asm_sz
)
_SZ_W
,
(
unsigned
int
*
)
x
,
1
(
_Asm_ldhit
)
_LDHINT_NONE
);
return
(
(
rc
>
0
)
?
1
:
0
);
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
int
status
=
1
;
do
{
if
(
*
((
unsigned
int
*
)
x
->
lock
)
==
0
)
{
status
=
gmx_spinlock_trylock
(
x
);
}
}
while
(
status
!=
0
);
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
_Asm_fetchadd
((
_Asm_fasz
)
_SZ_W
,(
_Asm_sem
)
_SEM_REL
,
(
unsigned
int
*
)
x
,
-
1
,(
_Asm_ldhint
)
_LDHINT_NONE
);
}
static
inline
void
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
x
->
lock
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
do
{
gmx_atomic_memory_barrier
();
}
while
(
gmx_spinlock_islocked
(
x
));
}
#undef gmx_hpia64_fetchadd
#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
#include <windows.h>
#define gmx_atomic_memory_barrier()
typedef
struct
gmx_atomic
{
LONG
volatile
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
LONG
volatile
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
#define gmx_atomic_fetch_add(a, i) \
InterlockedExchangeAdd((LONG volatile *)a, (LONG) i)
#define gmx_atomic_add_return(a, i) \
( i + InterlockedExchangeAdd((LONG volatile *)a, (LONG) i) )
#define gmx_atomic_cmpxchg(a, oldval, newval) \
InterlockedCompareExchange((LONG volatile *)a, (LONG) newval, (LONG) oldval)
# define gmx_spinlock_lock(x) \
while((InterlockedCompareExchange((LONG volatile *)&x, 1, 0))!=0)
#define gmx_spinlock_trylock(x) \
InterlockedCompareExchange((LONG volatile *)&x, 1, 0)
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
x
->
lock
=
0
;
}
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
*
(
volatile
signed
char
*
)(
&
(
x
)
->
lock
)
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
while
(
gmx_spinlock_islocked
(
x
))
{
Sleep
(
0
);
}
}
#elif defined(__xlC__) && defined (_AIX)
/* IBM xlC compiler on AIX */
#include <sys/atomic_op.h>
#define gmx_atomic_memory_barrier()
typedef
struct
gmx_atomic
{
volatile
int
value
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t
;
typedef
struct
gmx_spinlock
{
volatile
unsigned
int
lock
;
/*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t
;
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldval
,
int
newval
)
{
int
t
;
if
(
__check_lock
((
atomic_p
)
&
a
->
value
,
oldval
,
newval
))
{
/* Not successful - value had changed in memory. Reload value. */
t
=
a
->
value
;
}
else
{
/* replacement suceeded */
t
=
oldval
;
}
return
t
;
}
static
inline
void
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
oldval
,
newval
;
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
__check_lock
((
atomic_p
)
&
a
->
value
,
oldval
,
newval
));
return
newval
;
}
static
inline
void
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
oldval
,
newval
;
do
{
oldval
=
gmx_atomic_read
(
a
);
newval
=
oldval
+
i
;
}
while
(
__check_lock
((
atomic_p
)
&
a
->
value
,
oldval
,
newval
));
return
oldval
;
}
static
inline
void
gmx_spinlock_init
(
gmx_spinlock_t
*
x
)
{
__clear_lock
((
atomic_p
)
x
,
0
);
}
static
inline
void
gmx_spinlock_lock
(
gmx_spinlock_t
*
x
)
{
do
{
;
}
while
(
__check_lock
((
atomic_p
)
x
,
0
,
1
));
}
static
inline
void
gmx_spinlock_trylock
(
gmx_spinlock_t
*
x
)
{
/* Return 0 if we got the lock */
return
(
__check_lock
((
atomic_p
)
x
,
0
,
1
)
!=
0
)
}
static
inline
void
gmx_spinlock_unlock
(
gmx_spinlock_t
*
x
)
{
__clear_lock
((
atomic_p
)
x
,
0
);
}
static
inline
void
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
return
(
*
((
atomic_p
)
x
)
!=
0
);
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
while
(
gmx_spinlock_islocked
(
x
))
{
;
}
}
#else
/* No atomic operations, use mutex fallback. Documentation is in x86 section */
#define gmx_atomic_memory_barrier()
/* System mutex used for locking to guarantee atomicity */
static
pthread_mutex_t
gmx_atomic_mutex
=
PTHREAD_MUTEX_INITIALIZER
;
typedef
struct
gmx_atomic
{
int
value
;
}
gmx_atomic_t
;
#define gmx_spinlock_t pthread_mutex_t
# define GMX_SPINLOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
/* Since mutexes guarantee memory barriers this works fine */
#define gmx_atomic_read(a) ((a)->value)
static
inline
void
gmx_atomic_set
(
gmx_atomic_t
*
a
,
int
i
)
{
/* Mutexes here are necessary to guarantee memory visibility */
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
a
->
value
=
i
;
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
}
static
inline
int
gmx_atomic_add_return
(
gmx_atomic_t
*
a
,
int
i
)
{
int
t
;
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
t
=
a
->
value
+
i
;
a
->
value
=
t
;
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
return
t
;
}
static
inline
int
gmx_atomic_fetch_add
(
gmx_atomic_t
*
a
,
int
i
)
{
int
old_value
;
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
old_value
=
a
->
value
;
a
->
value
=
old_value
+
i
;
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
return
old_value
;
}
static
inline
int
gmx_atomic_cmpxchg
(
gmx_atomic_t
*
a
,
int
oldv
,
int
newv
)
{
int
t
;
pthread_mutex_lock
(
&
gmx_atomic_mutex
);
t
=
a
->
value
;
if
(
t
==
oldv
)
{
a
->
value
=
newv
;
}
pthread_mutex_unlock
(
&
gmx_atomic_mutex
);
return
t
;
}
#define gmx_spinlock_init(lock) pthread_mutex_init(lock)
#define gmx_spinlock_lock(lock) pthread_mutex_lock(lock)
#define gmx_spinlock_trylock(lock) pthread_mutex_trylock(lock)
#define gmx_spinlock_unlock(lock) pthread_mutex_unlock(lock)
static
inline
int
gmx_spinlock_islocked
(
gmx_spinlock_t
*
x
)
{
int
rc
;
if
(
gmx_spinlock_trylock
(
x
)
!=
0
)
{
/* It was locked */
return
1
;
}
else
{
/* We just locked it */
gmx_spinlock_unlock
(
x
);
return
0
;
}
}
static
inline
void
gmx_spinlock_wait
(
gmx_spinlock_t
*
x
)
{
int
rc
;
gmx_spinlock_lock
(
x
);
/* Got the lock now, so the waiting is over */
gmx_spinlock_unlock
(
x
);
}
#endif
/*! \brief Spinlock-based barrier type
*
* This barrier has the same functionality as the standard
* gmx_thread_barrier_t, but since it is based on spinlocks
* it provides faster synchronization at the cost of busy-waiting.
*
* Variables of this type should be initialized by calling
* gmx_spinlock_barrier_init() to set the number of threads
* that should be synchronized.
*/
typedef
struct
gmx_spinlock_barrier
{
gmx_atomic_t
count
;
/*!< Number of threads remaining */
int
threshold
;
/*!< Total number of threads */
volatile
int
cycle
;
/*!< Current cycle (alternating 0/1) */
}
gmx_spinlock_barrier_t
;
/*! \brief Initialize spinlock-based barrier
*
* \param barrier Pointer to _spinlock_ barrier. Note that this is not
* the same datatype as the full, thread based, barrier.
* \param count Number of threads to synchronize. All threads
* will be released after \a count calls to
* gmx_spinlock_barrier_wait().
*/
static
inline
void
gmx_spinlock_barrier_init
(
gmx_spinlock_barrier_t
*
barrier
,
int
count
)
{
barrier
->
threshold
=
count
;
barrier
->
cycle
=
0
;
gmx_atomic_set
(
&
(
barrier
->
count
),
count
);
}
/*! \brief Perform busy-waiting barrier synchronization
*
* This routine blocks until it has been called N times,
* where N is the count value the barrier was initialized with.
* After N total calls all threads return. The barrier automatically
* cycles, and thus requires another N calls to unblock another time.
*
* Note that spinlock-based barriers are completely different from
* standard ones (using mutexes and condition variables), only the
* functionality and names are similar.
*
* \param barrier Pointer to previously create barrier.
*
* \return The last thread returns -1, all the others 0.
*/
static
inline
int
gmx_spinlock_barrier_wait
(
gmx_spinlock_barrier_t
*
barrier
)
{
int
cycle
;
int
status
;
/* We don't need to lock or use atomic ops here, since the cycle index
* cannot change until after the last thread has performed the check
* further down. Further, they cannot reach this point in the next
* barrier iteration until all of them have been released, and that
* happens after the cycle value has been updated.
*
* No synchronization == fast synchronization.
*/
cycle
=
barrier
->
cycle
;
/* Decrement the count atomically and check if it is zero.
* This will only be true for the last thread calling us.
*/
if
(
gmx_atomic_add_return
(
&
(
barrier
->
count
),
-
1
)
==
0
)
{
gmx_atomic_set
(
&
(
barrier
->
count
),
barrier
->
threshold
);
barrier
->
cycle
=
!
barrier
->
cycle
;
status
=
-
1
;
}
else
{
/* Wait until the last thread changes the cycle index.
* We are both using a memory barrier, and explicit
* volatile pointer cast to make sure the compiler
* doesn't try to be smart and cache the contents.
*/
do
{
gmx_atomic_memory_barrier
();
}
while
(
*
(
volatile
int
*
)(
&
(
barrier
->
cycle
))
==
cycle
);
status
=
0
;
}
return
status
;
}
#ifdef __cplusplus
}
#endif
#endif
/* _GMX_ATOMIC_H_ */
platforms/cpu/tests/CMakeLists.txt
View file @
8e656070
...
@@ -11,7 +11,6 @@ IF( INCLUDE_SERIALIZATION )
...
@@ -11,7 +11,6 @@ IF( INCLUDE_SERIALIZATION )
INCLUDE_DIRECTORIES
(
${
OPENMM_DIR
}
/serialization/include
)
INCLUDE_DIRECTORIES
(
${
OPENMM_DIR
}
/serialization/include
)
SET
(
SHARED_OPENMM_SERIALIZATION
"OpenMMSerialization"
)
SET
(
SHARED_OPENMM_SERIALIZATION
"OpenMMSerialization"
)
ENDIF
(
INCLUDE_SERIALIZATION
)
ENDIF
(
INCLUDE_SERIALIZATION
)
GET_PROPERTY
(
COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS
)
# Automatically create tests using files named "Test*.cpp"
# Automatically create tests using files named "Test*.cpp"
FILE
(
GLOB TEST_PROGS
"*Test*.cpp"
)
FILE
(
GLOB TEST_PROGS
"*Test*.cpp"
)
...
@@ -21,7 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
...
@@ -21,7 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library
# Link with shared library
ADD_EXECUTABLE
(
${
TEST_ROOT
}
${
TEST_PROG
}
)
ADD_EXECUTABLE
(
${
TEST_ROOT
}
${
TEST_PROG
}
)
TARGET_LINK_LIBRARIES
(
${
TEST_ROOT
}
${
SHARED_TARGET
}
)
TARGET_LINK_LIBRARIES
(
${
TEST_ROOT
}
${
SHARED_TARGET
}
)
SET_TARGET_PROPERTIES
(
${
TEST_ROOT
}
PROPERTIES
COMPILE
_FLAGS
"
${
COMPILE_FLAGS
}
"
LINK
_FLAGS
"
${
COMPILE_FLAGS
}
"
)
SET_TARGET_PROPERTIES
(
${
TEST_ROOT
}
PROPERTIES
LINK
_FLAGS
"
${
EXTRA_
COMPILE_FLAGS
}
"
COMPILE
_FLAGS
"
${
EXTRA_
COMPILE_FLAGS
}
"
)
ADD_TEST
(
${
TEST_ROOT
}
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
single
)
ADD_TEST
(
${
TEST_ROOT
}
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
single
)
ENDFOREACH
(
TEST_PROG
${
TEST_PROGS
}
)
ENDFOREACH
(
TEST_PROG
${
TEST_PROGS
}
)
platforms/cpu/tests/TestCpuNeighborList.cpp
View file @
8e656070
...
@@ -35,6 +35,7 @@
...
@@ -35,6 +35,7 @@
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/ThreadPool.h"
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "CpuNeighborList.h"
#include "CpuPlatform.h"
#include "CpuPlatform.h"
#include "sfmt/SFMT.h"
#include "sfmt/SFMT.h"
...
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
...
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
const
float
boxSize
[
3
]
=
{
20.0
f
,
15.0
f
,
22.0
f
};
const
float
boxSize
[
3
]
=
{
20.0
f
,
15.0
f
,
22.0
f
};
OpenMM_SFMT
::
SFMT
sfmt
;
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
positions
(
4
*
numParticles
);
AlignedArray
<
float
>
positions
(
4
*
numParticles
);
for
(
int
i
=
0
;
i
<
4
*
numParticles
;
i
++
)
for
(
int
i
=
0
;
i
<
4
*
numParticles
;
i
++
)
if
(
i
%
4
<
3
)
if
(
i
%
4
<
3
)
positions
[
i
]
=
boxSize
[
i
%
4
]
*
genrand_real2
(
sfmt
);
positions
[
i
]
=
boxSize
[
i
%
4
]
*
genrand_real2
(
sfmt
);
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment