Commit 588a4c9d authored by peastman's avatar peastman
Browse files

Merged changes from main branch

parents bbf3de23 ff6af025
......@@ -105,16 +105,20 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
SET( LIB64 )
ENDIF( CMAKE_SIZEOF_VOID_P EQUAL 8 )
# Build universal binaries compatible with OS X 10.7
IF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
SET (CMAKE_OSX_ARCHITECTURES "i386;x86_64" CACHE STRING "The processor architectures to build for" FORCE)
SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.7" CACHE STRING "The minimum version of OS X to support" FORCE)
ENDIF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
# Improve the linking behavior of Mac libraries
IF (APPLE)
# Build universal binaries compatible with OS X 10.7
IF (NOT CMAKE_OSX_DEPLOYMENT_TARGET)
SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.7" CACHE STRING "The minimum version of OS X to support" FORCE)
ENDIF (NOT CMAKE_OSX_DEPLOYMENT_TARGET)
IF (NOT CMAKE_OSX_ARCHITECTURES)
SET (CMAKE_OSX_ARCHITECTURES "i386;x86_64" CACHE STRING "The processor architectures to build for" FORCE)
ENDIF (NOT CMAKE_OSX_ARCHITECTURES)
# Improve the linking behavior of Mac libraries
SET (CMAKE_INSTALL_NAME_DIR "@rpath")
SET_PROPERTY(GLOBAL PROPERTY COMPILE_FLAGS "-stdlib=libc++ -mmacosx-version-min=10.7")
SET(EXTRA_COMPILE_FLAGS "-stdlib=libc++")
ELSE (APPLE)
SET(EXTRA_COMPILE_FLAGS)
ENDIF (APPLE)
IF(UNIX AND NOT CMAKE_BUILD_TYPE)
......@@ -266,7 +270,7 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY")
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY")
IF(WIN32)
ADD_DEPENDENCIES(${SHARED_TARGET} PthreadsLibraries)
ENDIF(WIN32)
......@@ -274,7 +278,7 @@ ENDIF(WIN32)
SET(OPENMM_BUILD_STATIC_LIB OFF CACHE BOOL "Whether to build static OpenMM libraries")
IF(OPENMM_BUILD_STATIC_LIB)
ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY")
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY")
ENDIF(OPENMM_BUILD_STATIC_LIB)
IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
......
......@@ -26,14 +26,16 @@ SET(BUILD_TESTING_STATIC OFF)
IF(OPENMM_BUILD_STATIC_LIB)
SET(BUILD_TESTING_STATIC ON)
ENDIF(OPENMM_BUILD_STATIC_LIB)
FOREACH(EX_ROOT ${CPP_EXAMPLES})
IF (BUILD_TESTING_SHARED)
# Link with shared library
ADD_EXECUTABLE(${EX_ROOT} ${EX_ROOT}.cpp)
SET_TARGET_PROPERTIES(${EX_ROOT}
PROPERTIES
PROJECT_LABEL "Example - ${EX_ROOT}")
PROPERTIES
PROJECT_LABEL "Example - ${EX_ROOT}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
ENDIF (BUILD_TESTING_SHARED)
......@@ -42,9 +44,10 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
SET(EX_STATIC ${EX_ROOT}Static)
ADD_EXECUTABLE(${EX_STATIC} ${EX_ROOT}.cpp)
SET_TARGET_PROPERTIES(${EX_STATIC}
PROPERTIES
COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
PROJECT_LABEL "Example - ${EX_STATIC}")
PROPERTIES
PROJECT_LABEL "Example - ${EX_STATIC}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
ENDIF (BUILD_TESTING_STATIC)
......@@ -63,8 +66,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
# C++ libraries on the link line.
ADD_EXECUTABLE(${EX_ROOT} ${EX_ROOT}.c Empty.cpp)
SET_TARGET_PROPERTIES(${EX_ROOT}
PROPERTIES
PROJECT_LABEL "Example C - ${EX_ROOT}")
PROPERTIES
PROJECT_LABEL "Example C - ${EX_ROOT}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
ADD_DEPENDENCIES(${EX_ROOT} ApiWrappers)
ENDIF (BUILD_TESTING_SHARED)
......@@ -76,9 +81,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
# C++ libraries on the static link line.
ADD_EXECUTABLE(${EX_STATIC} ${EX_ROOT}.c Empty.cpp)
SET_TARGET_PROPERTIES(${EX_STATIC}
PROPERTIES
COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
PROJECT_LABEL "Example C - ${EX_STATIC}")
PROPERTIES
PROJECT_LABEL "Example C - ${EX_STATIC}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
ADD_DEPENDENCIES(${EX_STATIC} ApiWrappers)
ENDIF (BUILD_TESTING_STATIC)
......
......@@ -91,6 +91,9 @@ public:
fvec4 operator&(fvec4 other) const {
return _mm_and_ps(val, other);
}
fvec4 operator|(fvec4 other) const {
return _mm_or_ps(val, other);
}
fvec4 operator==(fvec4 other) const {
return _mm_cmpeq_ps(val, other);
}
......@@ -157,6 +160,9 @@ public:
ivec4 operator&(ivec4 other) const {
return _mm_and_si128(val, other);
}
ivec4 operator|(ivec4 other) const {
return _mm_or_si128(val, other);
}
ivec4 operator==(ivec4 other) const {
return _mm_cmpeq_epi32(val, other);
}
......@@ -267,5 +273,11 @@ static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2;
}
// Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(fvec4 v1, fvec4 v2, ivec4 mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
}
#endif /*OPENMM_VECTORIZE_H_*/
......@@ -14,10 +14,6 @@
# libOpenMMCPU_static[_d].a
#----------------------------------------------------
IF (APPLE)
SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
ENDIF (APPLE)
SUBDIRS (tests)
# The source is organized into subdirectories, but we handle them all from
......
#ifndef OPENMM_ALIGNEDARRAY_H_
#define OPENMM_ALIGNEDARRAY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
namespace OpenMM {
/**
* This class represents an array in memory whose starting point is guaranteed to
* be aligned with a 16 byte boundary. This can improve the performance of vectorized
* code, since loads and stores are more efficient.
*/
template <class T>
class AlignedArray {
public:
/**
* Default constructor, to allow AlignedArrays to be used inside collections.
*/
AlignedArray() : dataSize(0), baseData(0), data(0) {
}
/**
* Create an Aligned array that contains a specified number of elements.
*/
AlignedArray(int size) {
allocate(size);
}
~AlignedArray() {
if (baseData != 0)
delete[] baseData;
}
/**
* Get the number of elements in the array.
*/
int size() const {
return dataSize;
}
/**
* Change the size of the array. This may cause all contents to be lost.
*/
void resize(int size) {
if (dataSize == size)
return;
if (baseData != 0)
delete[] baseData;
allocate(size);
}
/**
* Get a reference to an element of the array.
*/
T& operator[](int i) {
return data[i];
}
/**
* Get a const reference to an element of the array.
*/
const T& operator[](int i) const {
return data[i];
}
private:
void allocate(int size) {
dataSize = size;
baseData = new char[size*sizeof(T)+16];
char* offsetData = baseData+15;
offsetData -= (long long)offsetData&0xF;
data = (T*) offsetData;
}
int dataSize;
char* baseData;
T* data;
};
} // namespace OpenMM
#endif /*OPENMM_ALIGNEDARRAY_H_*/
......@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
#define OPENMM_CPU_GBSAOBC_FORCE_H__
#include "AlignedArray.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/vectorize.h"
#include <set>
......@@ -84,7 +85,7 @@ public:
* @param totalEnergy total energy
* @param threads the thread pool to use
*/
void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/**
* This routine contains the code executed by each thread.
......@@ -105,10 +106,13 @@ private:
float logDX, logDXInv;
// The following variables are used to make information accessible to the individual threads.
float const* posq;
std::vector<std::vector<float> >* threadForce;
std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy;
void* atomicCounter;
static const int NUM_TABLE_POINTS;
static const float TABLE_MIN;
static const float TABLE_MAX;
/**
* Compute the displacement and squared distance between a collection of points, optionally using
......
......@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "windowsExportCpu.h"
#include "openmm/internal/ThreadPool.h"
#include <set>
......@@ -46,7 +47,7 @@ public:
class Voxels;
static const int BlockSize;
CpuNeighborList();
void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
int getNumBlocks() const;
const std::vector<int>& getSortedAtoms() const;
......
......@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/ThreadPool.h"
......@@ -143,7 +144,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/**
* This routine contains the code executed by each thread.
......@@ -156,6 +157,7 @@ private:
bool periodic;
bool ewald;
bool pme;
bool tableIsValid;
const CpuNeighborList* neighborList;
float periodicBoxSize[3];
float cutoffDistance, switchingDistance;
......@@ -172,8 +174,9 @@ private:
RealVec const* atomCoordinates;
std::pair<float, float> const* atomParameters;
std::set<int> const* exclusions;
std::vector<std::vector<float> >* threadForce;
std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy;
void* atomicCounter;
static const float TWO_OVER_SQRT_PI;
static const int NUM_TABLE_POINTS;
......
......@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "ReferencePlatform.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/ThreadPool.h"
......@@ -69,8 +70,8 @@ private:
class CpuPlatform::PlatformData {
public:
PlatformData(int numParticles);
std::vector<float> posq;
std::vector<std::vector<float> > threadForce;
AlignedArray<float> posq;
std::vector<AlignedArray<float> > threadForce;
ThreadPool threads;
bool isPeriodic;
};
......
#ifndef OPENMM_CPUSETTLE_H_
#define OPENMM_CPUSETTLE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "ReferenceSETTLEAlgorithm.h"
#include "openmm/System.h"
#include "openmm/internal/ThreadPool.h"
#include <vector>
namespace OpenMM {
/**
* This class uses multiple ReferenceSETTLEAlgorithm objects to execute the algorithm in parallel.
*/
class OPENMM_EXPORT CpuSETTLE : public ReferenceConstraintAlgorithm {
public:
class ApplyToPositionsTask;
class ApplyToVelocitiesTask;
CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads);
~CpuSETTLE();
/**
* Apply the constraint algorithm.
*
* @param atomCoordinates the original atom coordinates
* @param atomCoordinatesP the new atom coordinates
* @param inverseMasses 1/mass
* @param tolerance the constraint tolerance
*/
void apply(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& atomCoordinatesP, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
/**
* Apply the constraint algorithm to velocities.
*
* @param atomCoordinates the atom coordinates
* @param atomCoordinatesP the velocities to modify
* @param inverseMasses 1/mass
* @param tolerance the constraint tolerance
*/
void applyToVelocities(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
private:
std::vector<ReferenceSETTLEAlgorithm*> threadSettle;
ThreadPool& threads;
};
} // namespace OpenMM
#endif /*OPENMM_CPUSETTLE_H_*/
GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)
SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -msse4.1")
SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
......@@ -8,6 +7,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CPU_BUILDING_SHARED_LIBRARY" LINK_FLAGS "${COMPILE_FLAGS}")
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
......@@ -24,14 +24,16 @@
#include "CpuGBSAOBCForce.h"
#include "SimTKOpenMMRealType.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
#include <cmath>
using namespace std;
using namespace OpenMM;
const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025;
const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
public:
......@@ -44,22 +46,12 @@ public:
};
CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
logDX = 0.5/NUM_TABLE_POINTS;
logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
logDXInv = 1.0f/logDX;
vector<double> x(NUM_TABLE_POINTS+1);
vector<double> y(NUM_TABLE_POINTS+1);
vector<double> deriv;
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
x[i] = 0.5+i*0.5/NUM_TABLE_POINTS;
y[i] = log(x[i]);
}
SplineFitter::createNaturalSpline(x, y, deriv);
logTable.resize(4*NUM_TABLE_POINTS);
for (int i = 0; i < NUM_TABLE_POINTS; i++) {
logTable[4*i] = (float) y[i];
logTable[4*i+1] = (float) y[i+1];
logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
logTable.resize(NUM_TABLE_POINTS+4);
for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
double x = TABLE_MIN+i*logDX;
logTable[i] = log(x);
}
}
......@@ -93,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
obcChain.resize(params.size()+3);
}
void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->posq = &posq[0];
......@@ -104,16 +96,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
threadBornForces.resize(numThreads);
for (int i = 0; i < numThreads; i++)
threadBornForces[i].resize(particleParams.size()+3);
gmx_atomic_t counter;
this->atomicCounter = &counter;
// Signal the threads to start running and wait for them to finish.
ComputeTask task(*this);
gmx_atomic_set(&counter, 0);
threads.execute(task);
threads.waitForThreads(); // Compute Born radii
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads(); // Compute surface area term
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads(); // First loop
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads(); // Second loop
......@@ -141,8 +139,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Calculate Born radii
for (int blockStart = start; blockStart < end; blockStart += 4) {
int numInBlock = min(4, end-blockStart);
while (true) {
int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomRadius[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0};
......@@ -213,7 +214,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
vector<float>& bornForces = threadBornForces[threadIndex];
for (int i = 0; i < numParticles; i++)
bornForces[i] = 0.0f;
for (int atomI = start; atomI < end; atomI++) {
while (true) {
int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (atomI >= numParticles)
break;
if (bornRadii[atomI] > 0) {
float radiusI = particleParams[atomI].first + dielectricOffset;
float r = radiusI + probeRadius;
......@@ -235,8 +239,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
else
preFactor = 0.0f;
for (int blockStart = start; blockStart < end; blockStart += 4) {
int numInBlock = min(4, end-blockStart);
while (true) {
int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomCharge[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0};
......@@ -303,13 +310,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Second loop of Born energy computation.
for (int blockStart = start; blockStart < end; blockStart += 4) {
while (true) {
int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
fvec4 bornForce(0.0f);
for (int i = 0; i < numThreads; i++)
bornForce += fvec4(&threadBornForces[i][blockStart]);
fvec4 radii(&bornRadii[blockStart]);
bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
int numInBlock = min(4, end-blockStart);
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomRadius[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0};
......@@ -351,14 +361,13 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
fvec4 logRatio = fastLog(u_ij/l_ij);
fvec4 t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
fvec4 de = bornForce*t3*rInverse;
de = blend(0.0f, de, include);
fvec4 result[4] = {dx*de, dy*de, dz*de, 0.0f};
transpose(result[0], result[1], result[2], result[3]);
fvec4 atomForce(forces+4*atomJ);
for (int j = 0; j < 4; j++) {
if (include[j]) {
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
atomForce.store(forces+4*atomJ);
}
......@@ -385,21 +394,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
// Evaluate log(x) using a lookup table for speed.
float y[4];
fvec4 x1 = (x-0.5f)*logDXInv;
if (any((x < TABLE_MIN) | (x >= TABLE_MAX)))
return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
fvec4 x1 = (x-TABLE_MIN)*logDXInv;
ivec4 index = floor(x1);
fvec4 coeff[4];
coeff[1] = x1-index;
coeff[0] = 1.0f-coeff[1];
coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
static float maxdiff = 0.0f;
for (int i = 0; i < 4; i++) {
if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS)
y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]]));
else
y[i] = logf(x[i]);
}
return fvec4(y);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&logTable[index[0]]);
fvec4 t2(&logTable[index[1]]);
fvec4 t3(&logTable[index[2]]);
fvec4 t4(&logTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
......@@ -81,16 +81,16 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
// Convert the positions to single precision and apply periodic boundary conditions
vector<float>& posq = data.posq;
AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context);
RealVec boxSize = extractBoxSize(context);
float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
double invBoxSize[3] = {1/boxSize[0], 1/boxSize[1], 1/boxSize[2]};
int numParticles = context.getSystem().getNumParticles();
if (data.isPeriodic)
for (int i = 0; i < numParticles; i++)
for (int j = 0; j < 3; j++) {
RealOpenMM x = posData[i][j];
double base = floor(x/boxSize[j])*boxSize[j];
double base = floor(x*invBoxSize[j])*boxSize[j];
posq[4*i+j] = (float) (x-base);
}
else
......@@ -255,12 +255,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
}
}
}
vector<float>& posq = data.posq;
AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& forceData = extractForces(context);
RealVec boxSize = extractBoxSize(context);
float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
double energy = ewaldSelfEnergy;
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
bool ewald = (nonbondedMethod == Ewald);
bool pme = (nonbondedMethod == PME);
if (nonbondedMethod != NoCutoff) {
......@@ -330,7 +330,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
Vec3 periodicBoxSize(boxSize[0], boxSize[1], boxSize[2]);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxSize, includeEnergy);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
else
nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
......
......@@ -48,6 +48,8 @@ const int CpuNeighborList::BlockSize = 4;
class VoxelIndex
{
public:
VoxelIndex() : x(0), y(0) {
}
VoxelIndex(int x, int y) : x(x), y(y) {
}
int x;
......@@ -173,6 +175,9 @@ public:
float centerPos[4];
blockCenter.store(centerPos);
VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
VoxelIndex atomVoxelIndex[BlockSize];
for (int i = 0; i < (int) blockAtoms.size(); i++)
atomVoxelIndex[i] = getVoxelIndex(&atomLocations[4*blockAtoms[i]]);
int startx = centerVoxelIndex.x-dIndexX;
int starty = centerVoxelIndex.y-dIndexY;
int endx = centerVoxelIndex.x+dIndexX;
......@@ -194,39 +199,59 @@ public:
voxelIndex.x = x;
if (usePeriodic)
voxelIndex.x = (x < 0 ? x+nx : (x >= nx ? x-nx : x));
float dx = max(0.0f, voxelSizeX*max(0, abs(centerVoxelIndex.x-x)-1)-blockWidth[0]);
for (int y = starty; y <= endy; ++y) {
voxelIndex.y = y;
if (usePeriodic)
voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
float dy = max(0.0f, voxelSizeY*max(0, abs(centerVoxelIndex.y-y)-1)-blockWidth[1]);
// Identify the range of atoms within this bin we need to search. When using periodic boundary
// conditions, there may be two separate ranges.
float dz = maxDistance+blockWidth[2];
dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
float minz = centerPos[2];
float maxz = centerPos[2];
fvec4 offset(voxelSizeX*x+(usePeriodic ? 0.0f : minx), voxelSizeY*y+(usePeriodic ? 0.0f : miny), 0, 0);
for (int k = 0; k < (int) blockAtoms.size(); k++) {
const float* atomPos = &atomLocations[4*blockAtoms[k]];
fvec4 posVec(atomPos);
fvec4 delta1 = offset-posVec;
fvec4 delta2 = delta1+fvec4(voxelSizeX, voxelSizeY, 0, 0);
if (usePeriodic) {
delta1 -= round(delta1*invBoxSize)*boxSize;
delta2 -= round(delta2*invBoxSize)*boxSize;
}
fvec4 delta = min(abs(delta1), abs(delta2));
float dx = (x == atomVoxelIndex[k].x ? 0.0f : delta[0]);
float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
float dist2 = maxDistanceSquared-dx*dx-dy*dy;
if (dist2 > 0) {
float dist = sqrtf(dist2);
minz = min(minz, atomPos[2]-dist);
maxz = max(maxz, atomPos[2]+dist);
}
}
if (minz == maxz)
continue;
bool needPeriodic = (centerPos[0]-blockWidth[0] < maxDistance || centerPos[0]+blockWidth[0] > periodicBoxSize[0]-maxDistance ||
centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
centerPos[2]-dz < 0.0f || centerPos[2]+dz > periodicBoxSize[2]);
minz < 0.0f || maxz > periodicBoxSize[2]);
int rangeStart[2];
int rangeEnd[2];
rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, minz);
if (needPeriodic) {
numRanges = 2;
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
if (rangeStart[0] > 0) {
rangeStart[1] = 0;
rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz-periodicBoxSize[2]), rangeStart[0]);
rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, maxz-periodicBoxSize[2]), rangeStart[0]);
}
else {
rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz+periodicBoxSize[2]), rangeEnd[0]);
rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, minz+periodicBoxSize[2]), rangeEnd[0]);
rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
}
}
else {
numRanges = 1;
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
}
// Loop over atoms and check to see if they are neighbors of this block.
......@@ -307,7 +332,7 @@ public:
CpuNeighborList::CpuNeighborList() {
}
void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
blockNeighbors.resize(numBlocks);
......@@ -353,8 +378,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
if (!usePeriodic)
edgeSizeX = edgeSizeY = maxDistance; // TODO - adjust this as needed
else {
edgeSizeX = 0.5f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.5f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
}
Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
for (int i = 0; i < numAtoms; i++) {
......
......@@ -29,8 +29,8 @@
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
......@@ -40,7 +40,7 @@ using namespace std;
using namespace OpenMM;
const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
public:
......@@ -58,21 +58,22 @@ public:
--------------------------------------------------------------------------------------- */
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
}
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
Set the force to use a cutoff.
@param distance the cutoff distance
@param neighbors the neighbor list to use
@param solventDielectric the dielectric constant of the bulk solvent
@param distance the cutoff distance
@param neighbors the neighbor list to use
@param solventDielectric the dielectric constant of the bulk solvent
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
if (distance != cutoffDistance)
tableIsValid = false;
cutoff = true;
cutoffDistance = distance;
neighborList = &neighbors;
......@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
numRx = kmaxx;
numRy = kmaxy;
......@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
meshDim[0] = meshSize[0];
meshDim[1] = meshSize[1];
......@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void CpuNonbondedForce::tabulateEwaldScaleFactor() {
ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
if (tableIsValid)
return;
tableIsValid = true;
ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
ewaldDXInv = 1.0f/ewaldDX;
vector<double> x(NUM_TABLE_POINTS+1);
vector<double> y(NUM_TABLE_POINTS+1);
vector<double> deriv;
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
double r = i*ewaldDX;
double alphaR = alphaEwald*r;
x[i] = r;
y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
SplineFitter::createNaturalSpline(x, y, deriv);
ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
for (int i = 0; i < NUM_TABLE_POINTS; i++) {
ewaldScaleTable[4*i] = (float) y[i];
ewaldScaleTable[4*i+1] = (float) y[i+1];
ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
}
......@@ -291,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms;
......@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
this->threadForce = &threadForce;
includeEnergy = (totalEnergy != NULL);
threadEnergy.resize(threads.getNumThreads());
gmx_atomic_t counter;
gmx_atomic_set(&counter, 0);
this->atomicCounter = &counter;
// Signal the threads to start running and wait for them to finish.
......@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
if (ewald || pme) {
// Compute the interactions from the neighbor list.
for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize);
while (true) {
int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
......@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
else if (cutoff) {
// Compute the interactions from the neighbor list.
for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize);
while (true) {
int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
}
else {
// Loop over all atom pairs
for (int i = threadIndex; i < numberOfAtoms; i += numThreads){
while (true) {
int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (i >= numberOfAtoms)
break;
for (int j = i+1; j < numberOfAtoms; j++)
if (exclusions[j].find(i) == exclusions[j].end())
calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
......@@ -461,12 +472,12 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
break;
}
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
bool include[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
......@@ -475,43 +486,50 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
// Compute the distances to the block atoms.
bool any = false;
fvec4 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
for (int j = 0; j < 4; j++) {
include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || r2[j] < cutoffDistance*cutoffDistance));
any |= include[j];
}
if (!any)
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
// Accumulate energies.
......@@ -520,27 +538,25 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, 1.0f);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
transpose(result[0], result[1], result[2], result[3]);
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
......@@ -569,12 +585,12 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
needPeriodic = true;
break;
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
bool include[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
......@@ -583,60 +599,65 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
// Compute the distances to the block atoms.
bool any = false;
fvec4 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
for (int j = 0; j < 4; j++) {
include[j] = (((exclusions[i]>>j)&1) == 0 && r2[j] < cutoffDistance*cutoffDistance);
any |= include[j];
}
if (!any)
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
dEdR *= inverseR*inverseR;
fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, 1.0f);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
transpose(result[0], result[1], result[2], result[3]);
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
atomForce.store(forces+4*atom);
}
......@@ -683,18 +704,14 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
float y[4];
fvec4 x1 = x*ewaldDXInv;
ivec4 index = floor(x1);
fvec4 coeff[4];
coeff[1] = x1-index;
coeff[0] = 1.0f-coeff[1];
coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
for (int i = 0; i < 4; i++) {
if (index[i] < NUM_TABLE_POINTS)
y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
}
return fvec4(y);
ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&ewaldScaleTable[index[0]]);
fvec4 t2(&ewaldScaleTable[index[1]]);
fvec4 t3(&ewaldScaleTable[index[2]]);
fvec4 t4(&ewaldScaleTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
......@@ -32,6 +32,8 @@
#include "CpuPlatform.h"
#include "CpuKernelFactory.h"
#include "CpuKernels.h"
#include "CpuSETTLE.h"
#include "ReferenceConstraints.h"
#include "openmm/internal/hardware.h"
using namespace OpenMM;
......@@ -77,6 +79,12 @@ void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>
ReferencePlatform::contextCreated(context, properties);
PlatformData* data = new PlatformData(context.getSystem().getNumParticles());
contextData[&context] = data;
ReferenceConstraints& constraints = *(ReferenceConstraints*) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->constraints;
if (constraints.settle != NULL) {
CpuSETTLE* parallelSettle = new CpuSETTLE(context.getSystem(), *(ReferenceSETTLEAlgorithm*) constraints.settle, data->threads);
delete constraints.settle;
constraints.settle = parallelSettle;
}
}
void CpuPlatform::contextDestroyed(ContextImpl& context) const {
......@@ -89,8 +97,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
return *contextData[&context];
}
CpuPlatform::PlatformData::PlatformData(int numParticles) {
posq.resize(4*numParticles);
CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
int numThreads = threads.getNumThreads();
threadForce.resize(numThreads);
for (int i = 0; i < numThreads; i++)
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuSETTLE.h"
using namespace OpenMM;
using namespace std;
class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
public:
ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
}
void execute(ThreadPool& threads, int threadIndex) {
threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
}
vector<OpenMM::RealVec>& atomCoordinates;
vector<OpenMM::RealVec>& atomCoordinatesP;
vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle;
};
class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
public:
ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
}
void execute(ThreadPool& threads, int threadIndex) {
threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
}
vector<OpenMM::RealVec>& atomCoordinates;
vector<OpenMM::RealVec>& velocities;
vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle;
};
CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
int numThreads = threads.getNumThreads();
int numClusters = settle.getNumClusters();
vector<RealOpenMM> mass(system.getNumParticles());
for (int i = 0; i < system.getNumParticles(); i++)
mass[i] = system.getParticleMass(i);
for (int i = 0; i < numThreads; i++) {
int start = i*numClusters/numThreads;
int end = (i+1)*numClusters/numThreads;
if (start != end) {
int numThreadClusters = end-start;
vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
vector<RealOpenMM> distance1(numThreadClusters), distance2(numThreadClusters);
for (int j = 0; j < numThreadClusters; j++)
settle.getClusterParameters(start+j, atom1[j], atom2[j], atom3[j], distance1[j], distance2[j]);
threadSettle.push_back(new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, mass));
}
}
}
CpuSETTLE::~CpuSETTLE() {
for (int i = 0; i < (int) threadSettle.size(); i++)
delete threadSettle[i];
}
void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
threads.execute(task);
threads.waitForThreads();
}
void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
threads.execute(task);
threads.waitForThreads();
}
This diff is collapsed.
......@@ -11,7 +11,6 @@ IF( INCLUDE_SERIALIZATION )
INCLUDE_DIRECTORIES(${OPENMM_DIR}/serialization/include)
SET( SHARED_OPENMM_SERIALIZATION "OpenMMSerialization" )
ENDIF( INCLUDE_SERIALIZATION )
GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)
# Automatically create tests using files named "Test*.cpp"
FILE(GLOB TEST_PROGS "*Test*.cpp")
......@@ -21,7 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library
ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS}" LINK_FLAGS "${COMPILE_FLAGS}")
SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
ENDFOREACH(TEST_PROG ${TEST_PROGS})
......@@ -35,6 +35,7 @@
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/ThreadPool.h"
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "CpuPlatform.h"
#include "sfmt/SFMT.h"
......@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
const float boxSize[3] = {20.0f, 15.0f, 22.0f};
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> positions(4*numParticles);
AlignedArray<float> positions(4*numParticles);
for (int i = 0; i < 4*numParticles; i++)
if (i%4 < 3)
positions[i] = boxSize[i%4]*genrand_real2(sfmt);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment