Commit 047934e2 authored by Rafal P. Wiewiora's avatar Rafal P. Wiewiora
Browse files

Merge remote-tracking branch 'upstream/master'

parents ce3a5dc0 d12c9bd1
...@@ -41,14 +41,14 @@ using namespace OpenMM; ...@@ -41,14 +41,14 @@ using namespace OpenMM;
void testNeighborList() void testNeighborList()
{ {
vector<RealVec> particleList(2); vector<Vec3> particleList(2);
particleList[0] = RealVec(13.6, 0, 0); particleList[0] = Vec3(13.6, 0, 0);
particleList[1] = RealVec(0, 0, 0); particleList[1] = Vec3(0, 0, 0);
vector<set<int> > exclusions(2); vector<set<int> > exclusions(2);
NeighborList neighborList; NeighborList neighborList;
RealVec boxVectors[3]; Vec3 boxVectors[3];
computeNeighborListNaive(neighborList, 2, particleList, exclusions, boxVectors, false, 13.7, 0.01); computeNeighborListNaive(neighborList, 2, particleList, exclusions, boxVectors, false, 13.7, 0.01);
assert(neighborList.size() == 1); assert(neighborList.size() == 1);
...@@ -62,15 +62,15 @@ void testNeighborList() ...@@ -62,15 +62,15 @@ void testNeighborList()
assert(neighborList.size() == 0); assert(neighborList.size() == 0);
} }
double distance2(RealVec& pos1, RealVec& pos2, const RealVec* periodicBoxVectors) { double distance2(Vec3& pos1, Vec3& pos2, const Vec3* periodicBoxVectors) {
RealVec diff = pos1-pos2; Vec3 diff = pos1-pos2;
diff -= periodicBoxVectors[2]*floor(diff[2]/periodicBoxVectors[2][2]+0.5); diff -= periodicBoxVectors[2]*floor(diff[2]/periodicBoxVectors[2][2]+0.5);
diff -= periodicBoxVectors[1]*floor(diff[1]/periodicBoxVectors[1][1]+0.5); diff -= periodicBoxVectors[1]*floor(diff[1]/periodicBoxVectors[1][1]+0.5);
diff -= periodicBoxVectors[0]*floor(diff[0]/periodicBoxVectors[0][0]+0.5); diff -= periodicBoxVectors[0]*floor(diff[0]/periodicBoxVectors[0][0]+0.5);
return diff.dot(diff); return diff.dot(diff);
} }
void verifyNeighborList(NeighborList& list, int numParticles, vector<RealVec>& positions, const RealVec* periodicBoxVectors, double cutoff) { void verifyNeighborList(NeighborList& list, int numParticles, vector<Vec3>& positions, const Vec3* periodicBoxVectors, double cutoff) {
for (int i = 0; i < (int) list.size(); i++) { for (int i = 0; i < (int) list.size(); i++) {
int particle1 = list[i].first; int particle1 = list[i].first;
int particle2 = list[i].second; int particle2 = list[i].second;
...@@ -87,18 +87,18 @@ void verifyNeighborList(NeighborList& list, int numParticles, vector<RealVec>& p ...@@ -87,18 +87,18 @@ void verifyNeighborList(NeighborList& list, int numParticles, vector<RealVec>& p
void testPeriodic() { void testPeriodic() {
const int numParticles = 100; const int numParticles = 100;
const double cutoff = 3.0; const double cutoff = 3.0;
RealVec periodicBoxVectors[3]; Vec3 periodicBoxVectors[3];
periodicBoxVectors[0] = RealVec(20, 0, 0); periodicBoxVectors[0] = Vec3(20, 0, 0);
periodicBoxVectors[1] = RealVec(0, 15, 0); periodicBoxVectors[1] = Vec3(0, 15, 0);
periodicBoxVectors[2] = RealVec(0, 0, 22); periodicBoxVectors[2] = Vec3(0, 0, 22);
vector<RealVec> particleList(numParticles); vector<Vec3> particleList(numParticles);
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
for (int i = 0; i <numParticles; i++) { for (int i = 0; i <numParticles; i++) {
particleList[i][0] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[0][0]*3); particleList[i][0] = genrand_real2(sfmt)*periodicBoxVectors[0][0]*3;
particleList[i][1] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[1][1]*3); particleList[i][1] = genrand_real2(sfmt)*periodicBoxVectors[1][1]*3;
particleList[i][2] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[2][2]*3); particleList[i][2] = genrand_real2(sfmt)*periodicBoxVectors[2][2]*3;
} }
vector<set<int> > exclusions(numParticles); vector<set<int> > exclusions(numParticles);
NeighborList neighborList; NeighborList neighborList;
...@@ -111,18 +111,18 @@ void testPeriodic() { ...@@ -111,18 +111,18 @@ void testPeriodic() {
void testTriclinic() { void testTriclinic() {
const int numParticles = 1000; const int numParticles = 1000;
const double cutoff = 3.0; const double cutoff = 3.0;
RealVec periodicBoxVectors[3]; Vec3 periodicBoxVectors[3];
periodicBoxVectors[0] = RealVec(20, 0, 0); periodicBoxVectors[0] = Vec3(20, 0, 0);
periodicBoxVectors[1] = RealVec(5, 15, 0); periodicBoxVectors[1] = Vec3(5, 15, 0);
periodicBoxVectors[2] = RealVec(-3, -7, 22); periodicBoxVectors[2] = Vec3(-3, -7, 22);
vector<RealVec> particleList(numParticles); vector<Vec3> particleList(numParticles);
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
for (int i = 0; i <numParticles; i++) { for (int i = 0; i <numParticles; i++) {
particleList[i][0] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[0][0]*3); particleList[i][0] = genrand_real2(sfmt)*periodicBoxVectors[0][0]*3;
particleList[i][1] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[1][1]*3); particleList[i][1] = genrand_real2(sfmt)*periodicBoxVectors[1][1]*3;
particleList[i][2] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[2][2]*3); particleList[i][2] = genrand_real2(sfmt)*periodicBoxVectors[2][2]*3;
} }
vector<set<int> > exclusions(numParticles); vector<set<int> > exclusions(numParticles);
NeighborList neighborList; NeighborList neighborList;
......
...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaAngleForceImpl::getKernelNames() { ...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaAngleForceImpl::getKernelNames() {
void AmoebaAngleForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaAngleForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaAngleForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaAngleForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
...@@ -75,4 +75,5 @@ vector<pair<int, int> > AmoebaBondForceImpl::getBondedParticles() const { ...@@ -75,4 +75,5 @@ vector<pair<int, int> > AmoebaBondForceImpl::getBondedParticles() const {
void AmoebaBondForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaBondForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaBondForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaBondForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
...@@ -66,4 +66,5 @@ std::vector<std::string> AmoebaGeneralizedKirkwoodForceImpl::getKernelNames() { ...@@ -66,4 +66,5 @@ std::vector<std::string> AmoebaGeneralizedKirkwoodForceImpl::getKernelNames() {
void AmoebaGeneralizedKirkwoodForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaGeneralizedKirkwoodForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaGeneralizedKirkwoodForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaGeneralizedKirkwoodForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaInPlaneAngleForceImpl::getKernelNames() { ...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaInPlaneAngleForceImpl::getKernelNames() {
void AmoebaInPlaneAngleForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaInPlaneAngleForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaInPlaneAngleForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaInPlaneAngleForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
...@@ -50,7 +50,8 @@ AmoebaMultipoleForceImpl::~AmoebaMultipoleForceImpl() { ...@@ -50,7 +50,8 @@ AmoebaMultipoleForceImpl::~AmoebaMultipoleForceImpl() {
void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) { void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {
const System& system = context.getSystem(); const System& system = context.getSystem();
if (owner.getNumMultipoles() != system.getNumParticles()) int numParticles = system.getNumParticles();
if (owner.getNumMultipoles() != numParticles)
throw OpenMMException("AmoebaMultipoleForce must have exactly as many particles as the System it belongs to."); throw OpenMMException("AmoebaMultipoleForce must have exactly as many particles as the System it belongs to.");
// check cutoff < 0.5*boxSize // check cutoff < 0.5*boxSize
...@@ -64,7 +65,7 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) { ...@@ -64,7 +65,7 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {
} }
double quadrupoleValidationTolerance = 1.0e-05; double quadrupoleValidationTolerance = 1.0e-05;
for (int ii = 0; ii < system.getNumParticles(); ii++) { for (int ii = 0; ii < numParticles; ii++) {
int axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY; int axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY;
double charge, thole, dampingFactor, polarity ; double charge, thole, dampingFactor, polarity ;
...@@ -121,6 +122,23 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) { ...@@ -121,6 +122,23 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {
buffer << "] (ZThenX, Bisector, Z-Bisect, ThreeFold, NoAxisType) currently handled ."; buffer << "] (ZThenX, Bisector, Z-Bisect, ThreeFold, NoAxisType) currently handled .";
throw OpenMMException(buffer.str()); throw OpenMMException(buffer.str());
} }
if (axisType != AmoebaMultipoleForce::NoAxisType && (multipoleAtomZ < 0 || multipoleAtomZ >= numParticles)) {
std::stringstream buffer;
buffer << "AmoebaMultipoleForce: invalid z axis particle: " << multipoleAtomZ;
throw OpenMMException(buffer.str());
}
if (axisType != AmoebaMultipoleForce::NoAxisType && axisType != AmoebaMultipoleForce::ZOnly &&
(multipoleAtomX < 0 || multipoleAtomX >= numParticles)) {
std::stringstream buffer;
buffer << "AmoebaMultipoleForce: invalid x axis particle: " << multipoleAtomX;
throw OpenMMException(buffer.str());
}
if ((axisType == AmoebaMultipoleForce::ZBisect || axisType == AmoebaMultipoleForce::ThreeFold) &&
(multipoleAtomY < 0 || multipoleAtomY >= numParticles)) {
std::stringstream buffer;
buffer << "AmoebaMultipoleForce: invalid y axis particle: " << multipoleAtomY;
throw OpenMMException(buffer.str());
}
} }
kernel = context.getPlatform().createKernel(CalcAmoebaMultipoleForceKernel::Name(), context); kernel = context.getPlatform().createKernel(CalcAmoebaMultipoleForceKernel::Name(), context);
kernel.getAs<CalcAmoebaMultipoleForceKernel>().initialize(context.getSystem(), owner); kernel.getAs<CalcAmoebaMultipoleForceKernel>().initialize(context.getSystem(), owner);
...@@ -206,6 +224,7 @@ void AmoebaMultipoleForceImpl::getSystemMultipoleMoments(ContextImpl& context, s ...@@ -206,6 +224,7 @@ void AmoebaMultipoleForceImpl::getSystemMultipoleMoments(ContextImpl& context, s
void AmoebaMultipoleForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaMultipoleForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaMultipoleForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaMultipoleForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
void AmoebaMultipoleForceImpl::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const { void AmoebaMultipoleForceImpl::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
......
...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaOutOfPlaneBendForceImpl::getKernelNames() { ...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaOutOfPlaneBendForceImpl::getKernelNames() {
void AmoebaOutOfPlaneBendForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaOutOfPlaneBendForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaOutOfPlaneBendForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaOutOfPlaneBendForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaPiTorsionForceImpl::getKernelNames() { ...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaPiTorsionForceImpl::getKernelNames() {
void AmoebaPiTorsionForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaPiTorsionForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaPiTorsionForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaPiTorsionForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaStretchBendForceImpl::getKernelNames() { ...@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaStretchBendForceImpl::getKernelNames() {
void AmoebaStretchBendForceImpl::updateParametersInContext(ContextImpl& context) { void AmoebaStretchBendForceImpl::updateParametersInContext(ContextImpl& context) {
kernel.getAs<CalcAmoebaStretchBendForceKernel>().copyParametersToContext(context, owner); kernel.getAs<CalcAmoebaStretchBendForceKernel>().copyParametersToContext(context, owner);
context.systemChanged();
} }
...@@ -21,6 +21,7 @@ SET(OPENMM_SOURCE_SUBDIRS .) ...@@ -21,6 +21,7 @@ SET(OPENMM_SOURCE_SUBDIRS .)
SET(OPENMMAMOEBACUDA_LIBRARY_NAME OpenMMAmoebaCUDA) SET(OPENMMAMOEBACUDA_LIBRARY_NAME OpenMMAmoebaCUDA)
SET(SHARED_TARGET ${OPENMMAMOEBACUDA_LIBRARY_NAME}) SET(SHARED_TARGET ${OPENMMAMOEBACUDA_LIBRARY_NAME})
SET(STATIC_TARGET ${OPENMMAMOEBACUDA_LIBRARY_NAME}_static)
# These are all the places to search for header files which are # These are all the places to search for header files which are
...@@ -85,17 +86,42 @@ ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} ...@@ -85,17 +86,42 @@ ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
DEPENDS ${CUDA_KERNELS} DEPENDS ${CUDA_KERNELS}
) )
SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE) SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB}) # Build the shared plugin library.
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_AMOEBA_TARGET}) IF (OPENMM_BUILD_SHARED_LIB)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY") ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
IF (APPLE)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA") TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
ELSE (APPLE) TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}") TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_AMOEBA_TARGET})
ENDIF (APPLE) SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
IF (APPLE)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
ELSE (APPLE)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
ENDIF (APPLE)
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
ENDIF (OPENMM_BUILD_SHARED_LIB)
# Build the static plugin library.
IF(OPENMM_BUILD_STATIC_LIB)
ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${STATIC_AMOEBA_TARGET})
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_STATIC_LIBRARY")
IF (APPLE)
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
ELSE (APPLE)
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
ENDIF (APPLE)
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
ENDIF(OPENMM_BUILD_STATIC_LIB)
INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins) INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)
# Ensure that links to the main CUDA library will be resolved. # Ensure that links to the main CUDA library will be resolved.
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. * * Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Authors: Mark Friedrichs, Peter Eastman * * Authors: Mark Friedrichs, Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -33,10 +33,18 @@ ...@@ -33,10 +33,18 @@
using namespace OpenMM; using namespace OpenMM;
#ifdef OPENMM_BUILDING_STATIC_LIBRARY
static void registerPlatforms() {
#else
extern "C" OPENMM_EXPORT void registerPlatforms() { extern "C" OPENMM_EXPORT void registerPlatforms() {
#endif
} }
#ifdef OPENMM_BUILDING_STATIC_LIBRARY
static void registerKernelFactories() {
#else
extern "C" OPENMM_EXPORT void registerKernelFactories() { extern "C" OPENMM_EXPORT void registerKernelFactories() {
#endif
try { try {
Platform& platform = Platform::getPlatformByName("CUDA"); Platform& platform = Platform::getPlatformByName("CUDA");
AmoebaCudaKernelFactory* factory = new AmoebaCudaKernelFactory(); AmoebaCudaKernelFactory* factory = new AmoebaCudaKernelFactory();
...@@ -105,4 +113,4 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl ...@@ -105,4 +113,4 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
return new CudaCalcAmoebaWcaDispersionForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcAmoebaWcaDispersionForceKernel(name, platform, cu, context.getSystem());
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str()); throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
} }
\ No newline at end of file
...@@ -41,7 +41,7 @@ ...@@ -41,7 +41,7 @@
#include "CudaForceInfo.h" #include "CudaForceInfo.h"
#include "CudaKernelSources.h" #include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h" #include "CudaNonbondedUtilities.h"
#include "jama_svd.h" #include "jama_lu.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
...@@ -52,10 +52,10 @@ ...@@ -52,10 +52,10 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
#define CHECK_RESULT(result) \ #define CHECK_RESULT(result, prefix) \
if (result != CUDA_SUCCESS) { \ if (result != CUDA_SUCCESS) { \
std::stringstream m; \ std::stringstream m; \
m<<errorMessage<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \ m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\ throw OpenMMException(m.str());\
} }
...@@ -813,7 +813,7 @@ private: ...@@ -813,7 +813,7 @@ private:
}; };
CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), hasCreatedEvent(false),
multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL), multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL), fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL), diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
...@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri ...@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
inducedDipoleFieldGradientGk(NULL), inducedDipoleFieldGradientGkPolar(NULL), extrapolatedDipoleFieldGradient(NULL), extrapolatedDipoleFieldGradientPolar(NULL), inducedDipoleFieldGradientGk(NULL), inducedDipoleFieldGradientGkPolar(NULL), extrapolatedDipoleFieldGradient(NULL), extrapolatedDipoleFieldGradientPolar(NULL),
extrapolatedDipoleFieldGradientGk(NULL), extrapolatedDipoleFieldGradientGkPolar(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL), extrapolatedDipoleFieldGradientGk(NULL), extrapolatedDipoleFieldGradientGkPolar(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL), pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) { pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
} }
CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
...@@ -927,14 +927,14 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { ...@@ -927,14 +927,14 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete pmePhidp; delete pmePhidp;
if (pmeCphi != NULL) if (pmeCphi != NULL)
delete pmeCphi; delete pmeCphi;
if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex;
if (lastPositions != NULL) if (lastPositions != NULL)
delete lastPositions; delete lastPositions;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (hasInitializedFFT) if (hasInitializedFFT)
cufftDestroy(fft); cufftDestroy(fft);
if (hasCreatedEvent)
cuEventDestroy(syncEvent);
} }
void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) { void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
...@@ -1021,6 +1021,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1021,6 +1021,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors"); prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix"); diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix"); diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
CHECK_RESULT(cuEventCreate(&syncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for AmoebaMultipoleForce");
hasCreatedEvent = true;
} }
else if (polarizationType == AmoebaMultipoleForce::Extrapolated) { else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
int numOrders = force.getExtrapolationCoefficients().size(); int numOrders = force.getExtrapolationCoefficients().size();
...@@ -1153,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1153,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
NonbondedForce nb; NonbondedForce nb;
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance()); nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance()); nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ); NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX); gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY); gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ); gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
...@@ -1212,6 +1214,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1212,6 +1214,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS"); updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS"); recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix"); buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
solveMatrixKernel = cu.getKernel(module, "solveDIISMatrix");
initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles"); initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles"); iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles"); computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
...@@ -1253,7 +1256,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1253,7 +1256,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
else if (polarizationType == AmoebaMultipoleForce::Extrapolated) else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
pmeDefines["EXTRAPOLATED_POLARIZATION"] = ""; pmeDefines["EXTRAPOLATED_POLARIZATION"] = "";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates"); pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates"); pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates");
pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles"); pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
...@@ -1285,7 +1287,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1285,7 +1287,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp"); pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp");
pmeCphi = new CudaArray(cu, 10*numMultipoles, elementSize, "pmeCphi"); pmeCphi = new CudaArray(cu, 10*numMultipoles, elementSize, "pmeCphi");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numMultipoles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C); cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
if (result != CUFFT_SUCCESS) if (result != CUFFT_SUCCESS)
...@@ -1569,16 +1570,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1569,16 +1570,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Reciprocal space calculation. // Reciprocal space calculation.
unsigned int maxTiles = nb.getInteractingTiles().getSize(); unsigned int maxTiles = nb.getInteractingTiles().getSize();
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
sort->sort(*pmeAtomGridIndex);
void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
...@@ -1590,7 +1586,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1590,7 +1586,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
&pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else else
...@@ -1598,7 +1594,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1598,7 +1594,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(), void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
&fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
void* pmeTransformFixedPotentialArgs[] = {&pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; void* pmeTransformFixedPotentialArgs[] = {&pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms());
...@@ -1625,7 +1621,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1625,7 +1621,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.clearBuffer(*pmeGrid); cu.clearBuffer(*pmeGrid);
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
...@@ -1634,15 +1630,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1634,15 +1630,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
&pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
// Iterate until the dipoles converge. // Iterate until the dipoles converge.
...@@ -1771,7 +1766,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect ...@@ -1771,7 +1766,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads); cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
cu.clearBuffer(*pmeGrid); cu.clearBuffer(*pmeGrid);
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision()) {
...@@ -1784,15 +1779,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect ...@@ -1784,15 +1779,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
&pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
&pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
if (polarizationType == AmoebaMultipoleForce::Extrapolated) { if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
...@@ -1831,22 +1825,24 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) { ...@@ -1831,22 +1825,24 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2); cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
float2* errors = (float2*) cu.getPinnedBuffer(); float2* errors = (float2*) cu.getPinnedBuffer();
inducedDipoleErrors->download(errors, false); inducedDipoleErrors->download(errors, false);
cuEventRecord(syncEvent, cu.getCurrentStream());
// Build the DIIS matrix. // Build the DIIS matrix.
int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles); int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles);
void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()}; void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
int threadBlocks = min(numPrev, cu.getNumThreadBlocks()); int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*128, 128, 128*elementSize); int blockSize = 512;
vector<float> matrixf; cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*blockSize, blockSize, blockSize*elementSize);
vector<double> matrix;
if (cu.getUseDoublePrecision()) // Solve the matrix.
diisMatrix->download(matrix);
else void* solveMatrixArgs[] = {&iteration, &diisMatrix->getDevicePointer(), &diisCoefficients->getDevicePointer()};
diisMatrix->download(matrixf); cu.executeKernel(solveMatrixKernel, solveMatrixArgs, 32, 32);
// Determine whether the iteration has converged. // Determine whether the iteration has converged.
cuEventSynchronize(syncEvent);
double total1 = 0.0, total2 = 0.0; double total1 = 0.0, total2 = 0.0;
for (int j = 0; j < inducedDipoleErrors->getSize(); j++) { for (int j = 0; j < inducedDipoleErrors->getSize(); j++) {
total1 += errors[j].x; total1 += errors[j].x;
...@@ -1854,56 +1850,16 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) { ...@@ -1854,56 +1850,16 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
} }
if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon) if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
return true; return true;
// Compute the coefficients for selecting the new dipoles.
float* coefficients = (float*) cu.getPinnedBuffer();
if (iteration == 0)
coefficients[0] = 1;
else {
int rank = numPrev+1;
Array2D<double> b(rank, rank);
b[0][0] = 0;
for (int i = 1; i < rank; i++)
b[i][0] = b[0][i] = -1;
if (cu.getUseDoublePrecision()) {
for (int i = 0; i < numPrev; i++)
for (int j = 0; j < numPrev; j++)
b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j];
}
else {
for (int i = 0; i < numPrev; i++)
for (int j = 0; j < numPrev; j++)
b[i+1][j+1] = matrixf[i*MaxPrevDIISDipoles+j];
}
// Solve using SVD. Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
JAMA::SVD<double> svd(b);
Array2D<double> u, v;
svd.getU(u);
svd.getV(v);
Array1D<double> s;
svd.getSingularValues(s);
int effectiveRank = svd.rank();
for (int i = 1; i < rank; i++) {
double d = 0;
for (int j = 0; j < effectiveRank; j++)
d -= u[0][j]*v[i][j]/s[j];
coefficients[i-1] = d;
}
}
diisCoefficients->upload(coefficients, false);
// Compute the dipoles. // Compute the dipoles.
void* updateInducedFieldArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* updateInducedFieldArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&prevDipoles->getDevicePointer(), &prevDipolesPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev}; &prevDipoles->getDevicePointer(), &prevDipolesPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize); cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, 3*cu.getNumAtoms(), 256);
if (gkKernel != NULL) { if (gkKernel != NULL) {
void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
&prevDipolesGk->getDevicePointer(), &prevDipolesGkPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev}; &prevDipolesGk->getDevicePointer(), &prevDipolesGkPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize); cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, 3*cu.getNumAtoms(), 256);
} }
return false; return false;
} }
......
...@@ -408,7 +408,7 @@ private: ...@@ -408,7 +408,7 @@ private:
int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads; int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
double alpha, inducedEpsilon; double alpha, inducedEpsilon;
bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid; bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid, hasCreatedEvent;
AmoebaMultipoleForce::PolarizationType polarizationType; AmoebaMultipoleForce::PolarizationType polarizationType;
CudaContext& cu; CudaContext& cu;
const System& system; const System& system;
...@@ -465,16 +465,16 @@ private: ...@@ -465,16 +465,16 @@ private:
CudaArray* pmePhidp; CudaArray* pmePhidp;
CudaArray* pmeCphi; CudaArray* pmeCphi;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaArray* lastPositions; CudaArray* lastPositions;
CudaSort* sort; CudaSort* sort;
cufftHandle fft; cufftHandle fft;
CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel; CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel; CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel; CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
CUfunction recordDIISDipolesKernel, buildMatrixKernel; CUfunction recordDIISDipolesKernel, buildMatrixKernel, solveMatrixKernel;
CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel; CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel; CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
CUevent syncEvent;
CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel; CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
static const int PmeOrder = 5; static const int PmeOrder = 5;
static const int MaxPrevDIISDipoles = 20; static const int MaxPrevDIISDipoles = 20;
......
...@@ -107,9 +107,9 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has ...@@ -107,9 +107,9 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
real dmp = atom1.damp*atom2.damp; real dmp = atom1.damp*atom2.damp;
real a = min(atom1.thole, atom2.thole); real a = min(atom1.thole, atom2.thole);
real u = fabs(dmp) > 1.0e-5f ? r/dmp : 1e10f; real u = r/dmp;
real au3 = a*u*u*u; real au3 = fabs(dmp) > 1.0e-5f ? a*u*u*u : 0;
real expau3 = au3 < 50 ? EXP(-au3) : 0; real expau3 = fabs(dmp) > 1.0e-5f ? EXP(-au3) : 0;
real a2u6 = au3*au3; real a2u6 = au3*au3;
real a3u9 = a2u6*au3; real a3u9 = a2u6*au3;
// Thole damping factors for energies // Thole damping factors for energies
...@@ -336,7 +336,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has ...@@ -336,7 +336,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
iEIY -= eCoef*(qiUinpI.y*qiUindJ.x + qiUindI.y*qiUinpJ.x); iEIY -= eCoef*(qiUinpI.y*qiUindJ.x + qiUindI.y*qiUinpJ.x);
iEJY -= eCoef*(qiUinpJ.y*qiUindI.x + qiUindJ.y*qiUinpI.x); iEJY -= eCoef*(qiUinpJ.y*qiUindI.x + qiUindJ.y*qiUinpI.x);
fIZ += dCoef*(qiUinpI.x*qiUindJ.x + qiUindI.x*qiUinpJ.x); fIZ += dCoef*(qiUinpI.x*qiUindJ.x + qiUindI.x*qiUinpJ.x);
fIZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x); fJZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x);
// Uind-Uind terms (m=1) // Uind-Uind terms (m=1)
eCoef = 2*rInvVec[3]*thole_d1; eCoef = 2*rInvVec[3]*thole_d1;
dCoef = -3*rInvVec[4]*dthole_d1; dCoef = -3*rInvVec[4]*dthole_d1;
...@@ -345,7 +345,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has ...@@ -345,7 +345,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
iEIY += eCoef*(qiUinpI.x*qiUindJ.y + qiUindI.x*qiUinpJ.y); iEIY += eCoef*(qiUinpI.x*qiUindJ.y + qiUindI.x*qiUinpJ.y);
iEJY += eCoef*(qiUinpJ.x*qiUindI.y + qiUindJ.x*qiUinpI.y); iEJY += eCoef*(qiUinpJ.x*qiUindI.y + qiUindJ.x*qiUinpI.y);
fIZ += dCoef*(qiUinpI.y*qiUindJ.y + qiUindI.y*qiUinpJ.y + qiUinpI.z*qiUindJ.z + qiUindI.z*qiUinpJ.z); fIZ += dCoef*(qiUinpI.y*qiUindJ.y + qiUindI.y*qiUinpJ.y + qiUinpI.z*qiUindJ.z + qiUindI.z*qiUinpJ.z);
fIZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z); fJZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z);
#endif #endif
// The quasi-internal frame forces and torques. Note that the induced torque intermediates are // The quasi-internal frame forces and torques. Note that the induced torque intermediates are
...@@ -545,7 +545,7 @@ extern "C" __global__ void computeElectrostatics( ...@@ -545,7 +545,7 @@ extern "C" __global__ void computeElectrostatics(
data.force = make_real3(0); data.force = make_real3(0);
data.torque = make_real3(0); data.torque = make_real3(0);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx); unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
......
...@@ -655,7 +655,7 @@ extern "C" __global__ void computeFixedField( ...@@ -655,7 +655,7 @@ extern "C" __global__ void computeFixedField(
data.bornRadius = bornRadii[atom1]; data.bornRadius = bornRadii[atom1];
#endif #endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx); unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
......
...@@ -514,7 +514,7 @@ extern "C" __global__ void computeInducedField( ...@@ -514,7 +514,7 @@ extern "C" __global__ void computeInducedField(
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole); loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif #endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx); unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
...@@ -607,11 +607,6 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri ...@@ -607,11 +607,6 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri
const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors,
real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors, real* __restrict__ matrix) { real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors, real* __restrict__ matrix) {
extern __shared__ real2 buffer[]; extern __shared__ real2 buffer[];
#ifdef USE_EWALD
const real ewaldScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
#else
const real ewaldScale = 0;
#endif
const real fieldScale = 1/(real) 0x100000000; const real fieldScale = 1/(real) 0x100000000;
real sumErrors = 0; real sumErrors = 0;
real sumPolarErrors = 0; real sumPolarErrors = 0;
...@@ -699,6 +694,126 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int ...@@ -699,6 +694,126 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int
} }
} }
extern "C" __global__ void solveDIISMatrix(int iteration, const real* __restrict__ matrix, float* __restrict__ coefficients) {
__shared__ real b[MAX_PREV_DIIS_DIPOLES+1][MAX_PREV_DIIS_DIPOLES+1];
__shared__ real piv[MAX_PREV_DIIS_DIPOLES+1];
__shared__ real x[MAX_PREV_DIIS_DIPOLES+1];
// On the first iteration we don't need to do any calculation.
if (iteration == 0) {
if (threadIdx.x == 0)
coefficients[0] = 1;
return;
}
// Load the matrix.
int numPrev = min(iteration+1, MAX_PREV_DIIS_DIPOLES);
int rank = numPrev+1;
for (int index = threadIdx.x; index < numPrev*numPrev; index += blockDim.x) {
int i = index/numPrev;
int j = index-i*numPrev;
b[i+1][j+1] = matrix[i*MAX_PREV_DIIS_DIPOLES+j];
}
for (int i = threadIdx.x; i < rank; i += blockDim.x) {
b[i][0] = -1;
piv[i] = i;
}
__syncthreads();
// Compute the mean absolute value of the values we just loaded. We use that for preconditioning it,
// which is essential for doing the computation in single precision.
if (threadIdx.x == 0) {
real mean = 0;
for (int i = 0; i < numPrev; i++)
for (int j = 0; j < numPrev; j++)
mean += fabs(b[i+1][j+1]);
mean /= numPrev*numPrev;
b[0][0] = 0;
for (int i = 1; i < rank; i++)
b[0][i] = -mean;
// Compute the LU decomposition of the matrix. This code is adapted from JAMA.
int pivsign = 1;
for (int j = 0; j < rank; j++) {
// Apply previous transformations.
for (int i = 0; i < rank; i++) {
// Most of the time is spent in the following dot product.
int kmax = min(i, j);
real s = 0;
for (int k = 0; k < kmax; k++)
s += b[i][k] * b[k][j];
b[i][j] -= s;
}
// Find pivot and exchange if necessary.
int p = j;
for (int i = j+1; i < rank; i++)
if (abs(b[i][j]) > abs(b[p][j]))
p = i;
if (p != j) {
int k = 0;
for (k = 0; k < rank; k++) {
real t = b[p][k];
b[p][k] = b[j][k];
b[j][k] = t;
}
k = piv[p];
piv[p] = piv[j];
piv[j] = k;
pivsign = -pivsign;
}
// Compute multipliers.
if ((j < rank) && (b[j][j] != 0))
for (int i = j+1; i < rank; i++)
b[i][j] /= b[j][j];
}
for (int i = 0; i < rank; i++)
if (b[i][i] == 0) {
// The matrix is singular.
for (int j = 0; j < rank-1; j++)
coefficients[j] = 0;
coefficients[rank-1] = 1;
return;
}
// Solve b*Y = X(piv)
for (int i = 0; i < rank; i++)
x[i] = (piv[i] == 0 ? -1 : 0);
for (int k = 0; k < rank; k++)
for (int i = k+1; i < rank; i++)
x[i] -= x[k] * b[i][k];
// Solve U*X = Y;
for (int k = rank-1; k >= 0; k--) {
x[k] /= b[k][k];
for (int i = 0; i < k; i++)
x[i] -= x[k] * b[i][k];
}
// Record the coefficients.
real lastCoeff = 1;
for (int i = 0; i < rank-1; i++) {
real c = x[i+1]*mean;
coefficients[i] = c;
lastCoeff -= c;
}
coefficients[rank-1] = lastCoeff;
}
}
extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar,
const real* __restrict__ prevDipoles, const real* __restrict__ prevDipolesPolar, const float* __restrict__ coefficients, int numPrev) { const real* __restrict__ prevDipoles, const real* __restrict__ prevDipolesPolar, const float* __restrict__ coefficients, int numPrev) {
for (int index = blockIdx.x*blockDim.x + threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x + threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) {
......
...@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) { ...@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
thetai[i-1] = make_real4(ARRAY(PME_ORDER,i), ARRAY(PME_ORDER-1,i), ARRAY(PME_ORDER-2,i), ARRAY(PME_ORDER-3,i)); thetai[i-1] = make_real4(ARRAY(PME_ORDER,i), ARRAY(PME_ORDER-1,i), ARRAY(PME_ORDER-2,i), ARRAY(PME_ORDER-3,i));
} }
/**
* Compute the index of the grid point each atom is associated with.
*/
extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i];
pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
// First axis.
real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
int igrid1 = ifr-PME_ORDER+1;
// Second axis.
w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
int igrid2 = ifr-PME_ORDER+1;
// Third axis.
w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
int igrid3 = ifr-PME_ORDER+1;
// Record the grid point.
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
}
}
/** /**
* Convert the fixed multipoles from Cartesian to fractional coordinates. * Convert the fixed multipoles from Cartesian to fractional coordinates.
*/ */
...@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real* ...@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
} }
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole, extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole,
const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
...@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p ...@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
} }
extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole, extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
...@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co ...@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi, extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq, long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const real* __restrict__ labFrameDipole, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, const real* __restrict__ labFrameDipole, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) { real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
#else #else
...@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
} }
__syncthreads(); __syncthreads();
// Process the atoms in spatially sorted order. This improves cache performance when loading for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f); pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f); pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
...@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real tuv102 = 0; real tuv102 = 0;
real tuv012 = 0; real tuv012 = 0;
real tuv111 = 0; real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
real4 v = theta3[iz]; real4 v = theta1[ix];
real tu00 = 0; real tu00 = 0;
real tu10 = 0; real tu10 = 0;
real tu01 = 0; real tu01 = 0;
...@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[iy]; real4 u = theta2[iy];
real4 t = make_real4(0, 0, 0, 0); real4 t = make_real4(0, 0, 0, 0);
for (int ix = 0; ix < PME_ORDER; ix++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0); int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k; int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real tq = pmeGrid[gridIndex].x; real tq = pmeGrid[gridIndex].x;
real4 tadd = theta1[ix]; real4 tadd = theta3[iz];
t.x += tq*tadd.x; t.x += tq*tadd.x;
t.y += tq*tadd.y; t.y += tq*tadd.y;
t.z += tq*tadd.z; t.z += tq*tadd.z;
t.w += tq*tadd.w; t.w += tq*tadd.w;
} }
tu00 += t.x*u.x; tu00 += u.x*t.x;
tu10 += t.y*u.x; tu10 += u.y*t.x;
tu01 += t.x*u.y; tu01 += u.x*t.y;
tu20 += t.z*u.x; tu20 += u.z*t.x;
tu11 += t.y*u.y; tu11 += u.y*t.y;
tu02 += t.x*u.z; tu02 += u.x*t.z;
tu30 += t.w*u.x; tu30 += u.w*t.x;
tu21 += t.z*u.y; tu21 += u.z*t.y;
tu12 += t.y*u.z; tu12 += u.y*t.z;
tu03 += t.x*u.w; tu03 += u.x*t.w;
} }
tuv000 += tu00*v.x; tuv000 += v.x*tu00;
tuv100 += tu10*v.x; tuv100 += v.y*tu00;
tuv010 += tu01*v.x; tuv010 += v.x*tu10;
tuv001 += tu00*v.y; tuv001 += v.x*tu01;
tuv200 += tu20*v.x; tuv200 += v.z*tu00;
tuv020 += tu02*v.x; tuv020 += v.x*tu20;
tuv002 += tu00*v.z; tuv002 += v.x*tu02;
tuv110 += tu11*v.x; tuv110 += v.y*tu10;
tuv101 += tu10*v.y; tuv101 += v.y*tu01;
tuv011 += tu01*v.y; tuv011 += v.x*tu11;
tuv300 += tu30*v.x; tuv300 += v.w*tu00;
tuv030 += tu03*v.x; tuv030 += v.x*tu30;
tuv003 += tu00*v.w; tuv003 += v.x*tu03;
tuv210 += tu21*v.x; tuv210 += v.z*tu10;
tuv201 += tu20*v.y; tuv201 += v.z*tu01;
tuv120 += tu12*v.x; tuv120 += v.y*tu20;
tuv021 += tu02*v.y; tuv021 += v.x*tu21;
tuv102 += tu10*v.z; tuv102 += v.y*tu02;
tuv012 += tu01*v.z; tuv012 += v.x*tu12;
tuv111 += tu11*v.y; tuv111 += v.y*tu11;
} }
phi[m] = tuv000; phi[m] = tuv000;
phi[m+NUM_ATOMS] = tuv100; phi[m+NUM_ATOMS] = tuv100;
...@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid, extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq, real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX,
real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) { real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
#else #else
...@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER]; real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
// the grid values.
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[atom].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f); pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f); pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
...@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real tuv102 = 0; real tuv102 = 0;
real tuv012 = 0; real tuv012 = 0;
real tuv111 = 0; real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
real4 v = theta3[iz]; real4 v = theta1[ix];
real tu00_1 = 0; real tu00_1 = 0;
real tu01_1 = 0; real tu01_1 = 0;
real tu10_1 = 0; real tu10_1 = 0;
...@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real t1_2 = 0; real t1_2 = 0;
real t2_2 = 0; real t2_2 = 0;
real t3 = 0; real t3 = 0;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0); int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k; int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real2 tq = pmeGrid[gridIndex]; real2 tq = pmeGrid[gridIndex];
real4 tadd = theta1[ix]; real4 tadd = theta3[iz];
t0_1 += tq.x*tadd.x; t0_1 += tq.x*tadd.x;
t1_1 += tq.x*tadd.y; t1_1 += tq.x*tadd.y;
t2_1 += tq.x*tadd.z; t2_1 += tq.x*tadd.z;
...@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
t2_2 += tq.y*tadd.z; t2_2 += tq.y*tadd.z;
t3 += (tq.x+tq.y)*tadd.w; t3 += (tq.x+tq.y)*tadd.w;
} }
tu00_1 += t0_1*u.x; tu00_1 += u.x*t0_1;
tu10_1 += t1_1*u.x; tu10_1 += u.y*t0_1;
tu01_1 += t0_1*u.y; tu01_1 += u.x*t1_1;
tu20_1 += t2_1*u.x; tu20_1 += u.z*t0_1;
tu11_1 += t1_1*u.y; tu11_1 += u.y*t1_1;
tu02_1 += t0_1*u.z; tu02_1 += u.x*t2_1;
tu00_2 += t0_2*u.x; tu00_2 += u.x*t0_2;
tu10_2 += t1_2*u.x; tu10_2 += u.y*t0_2;
tu01_2 += t0_2*u.y; tu01_2 += u.x*t1_2;
tu20_2 += t2_2*u.x; tu20_2 += u.z*t0_2;
tu11_2 += t1_2*u.y; tu11_2 += u.y*t1_2;
tu02_2 += t0_2*u.z; tu02_2 += u.x*t2_2;
real t0 = t0_1 + t0_2; real t0 = t0_1 + t0_2;
real t1 = t1_1 + t1_2; real t1 = t1_1 + t1_2;
real t2 = t2_1 + t2_2; real t2 = t2_1 + t2_2;
tu00 += t0*u.x; tu00 += u.x*t0;
tu10 += t1*u.x; tu10 += u.y*t0;
tu01 += t0*u.y; tu01 += u.x*t1;
tu20 += t2*u.x; tu20 += u.z*t0;
tu11 += t1*u.y; tu11 += u.y*t1;
tu02 += t0*u.z; tu02 += u.x*t2;
tu30 += t3*u.x; tu30 += u.w*t0;
tu21 += t2*u.y; tu21 += u.z*t1;
tu12 += t1*u.z; tu12 += u.y*t2;
tu03 += t0*u.w; tu03 += u.x*t3;
} }
tuv100_1 += tu10_1*v.x; tuv100_1 += v.y*tu00_1;
tuv010_1 += tu01_1*v.x; tuv010_1 += v.x*tu10_1;
tuv001_1 += tu00_1*v.y; tuv001_1 += v.x*tu01_1;
tuv200_1 += tu20_1*v.x; tuv200_1 += v.z*tu00_1;
tuv020_1 += tu02_1*v.x; tuv020_1 += v.x*tu20_1;
tuv002_1 += tu00_1*v.z; tuv002_1 += v.x*tu02_1;
tuv110_1 += tu11_1*v.x; tuv110_1 += v.y*tu10_1;
tuv101_1 += tu10_1*v.y; tuv101_1 += v.y*tu01_1;
tuv011_1 += tu01_1*v.y; tuv011_1 += v.x*tu11_1;
tuv100_2 += tu10_2*v.x; tuv100_2 += v.y*tu00_2;
tuv010_2 += tu01_2*v.x; tuv010_2 += v.x*tu10_2;
tuv001_2 += tu00_2*v.y; tuv001_2 += v.x*tu01_2;
tuv200_2 += tu20_2*v.x; tuv200_2 += v.z*tu00_2;
tuv020_2 += tu02_2*v.x; tuv020_2 += v.x*tu20_2;
tuv002_2 += tu00_2*v.z; tuv002_2 += v.x*tu02_2;
tuv110_2 += tu11_2*v.x; tuv110_2 += v.y*tu10_2;
tuv101_2 += tu10_2*v.y; tuv101_2 += v.y*tu01_2;
tuv011_2 += tu01_2*v.y; tuv011_2 += v.x*tu11_2;
tuv000 += tu00*v.x; tuv000 += v.x*tu00;
tuv100 += tu10*v.x; tuv100 += v.y*tu00;
tuv010 += tu01*v.x; tuv010 += v.x*tu10;
tuv001 += tu00*v.y; tuv001 += v.x*tu01;
tuv200 += tu20*v.x; tuv200 += v.z*tu00;
tuv020 += tu02*v.x; tuv020 += v.x*tu20;
tuv002 += tu00*v.z; tuv002 += v.x*tu02;
tuv110 += tu11*v.x; tuv110 += v.y*tu10;
tuv101 += tu10*v.y; tuv101 += v.y*tu01;
tuv011 += tu01*v.y; tuv011 += v.x*tu11;
tuv300 += tu30*v.x; tuv300 += v.w*tu00;
tuv030 += tu03*v.x; tuv030 += v.x*tu30;
tuv003 += tu00*v.w; tuv003 += v.x*tu03;
tuv210 += tu21*v.x; tuv210 += v.z*tu10;
tuv201 += tu20*v.y; tuv201 += v.z*tu01;
tuv120 += tu12*v.x; tuv120 += v.y*tu20;
tuv021 += tu02*v.y; tuv021 += v.x*tu21;
tuv102 += tu10*v.z; tuv102 += v.y*tu02;
tuv012 += tu01*v.z; tuv012 += v.x*tu12;
tuv111 += tu11*v.y; tuv111 += v.y*tu11;
} }
phid[m] = 0; phid[m] = 0;
phid[m+NUM_ATOMS] = tuv100_1; phid[m+NUM_ATOMS] = tuv100_1;
......
...@@ -24,13 +24,19 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4 ...@@ -24,13 +24,19 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
// code common to ZThenX and Bisector // code common to ZThenX and Bisector
int4 particles = multipoleParticles[atom]; int4 particles = multipoleParticles[atom];
if (particles.x >= 0 && particles.z >= 0) { if (particles.z >= 0) {
real4 thisParticlePos = posq[atom]; real4 thisParticlePos = posq[atom];
real4 posZ = posq[particles.z]; real4 posZ = posq[particles.z];
real3 vectorZ = make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z); real3 vectorZ = make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z);
real4 posX = posq[particles.x];
real3 vectorX = make_real3(posX.x-thisParticlePos.x, posX.y-thisParticlePos.y, posX.z-thisParticlePos.z);
int axisType = particles.w; int axisType = particles.w;
real4 posX;
real3 vectorX;
if (axisType >= 4)
vectorX = make_real3((real) 0.1f);
else {
posX = posq[particles.x];
vectorX = make_real3(posX.x-thisParticlePos.x, posX.y-thisParticlePos.y, posX.z-thisParticlePos.z);
}
/* /*
z-only z-only
...@@ -108,8 +114,6 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4 ...@@ -108,8 +114,6 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
} }
} }
else if (axisType >= 4)
vectorX = make_real3((real) 0.1f);
// x = x - (x.z)z // x = x - (x.z)z
...@@ -137,7 +141,7 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4 ...@@ -137,7 +141,7 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
// Check the chirality and see whether it needs to be reversed // Check the chirality and see whether it needs to be reversed
bool reverse = false; bool reverse = false;
if (axisType != 0 && particles.x >= 0 && particles.y >=0 && particles.z >= 0) { if (axisType == 0 && particles.x >= 0 && particles.y >=0 && particles.z >= 0) {
real4 posY = posq[particles.y]; real4 posY = posq[particles.y];
real delta[4][3]; real delta[4][3];
......
...@@ -612,7 +612,7 @@ extern "C" __global__ void computeElectrostatics( ...@@ -612,7 +612,7 @@ extern "C" __global__ void computeElectrostatics(
data.force = make_real3(0); data.force = make_real3(0);
data.torque = make_real3(0); data.torque = make_real3(0);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx); unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
......
...@@ -284,8 +284,11 @@ vector<Vec3> setupWaterDimer(System& system, AmoebaMultipoleForce* amoebaMultip ...@@ -284,8 +284,11 @@ vector<Vec3> setupWaterDimer(System& system, AmoebaMultipoleForce* amoebaMultip
static void check_finite_differences(vector<Vec3> analytic_forces, Context &context, vector<Vec3> positions) static void check_finite_differences(vector<Vec3> analytic_forces, Context &context, vector<Vec3> positions)
{ {
// Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount. double tol = 1e-5;
// We allow more permissive testing for single precision.
if(Platform::getPlatformByName("CUDA").getPropertyValue(context, "Precision") != "double") tol = 5e-4;
// Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
double norm = 0.0; double norm = 0.0;
for (int i = 0; i < (int) analytic_forces.size(); ++i) for (int i = 0; i < (int) analytic_forces.size(); ++i)
norm += analytic_forces[i].dot(analytic_forces[i]); norm += analytic_forces[i].dot(analytic_forces[i]);
...@@ -303,7 +306,7 @@ static void check_finite_differences(vector<Vec3> analytic_forces, Context &cont ...@@ -303,7 +306,7 @@ static void check_finite_differences(vector<Vec3> analytic_forces, Context &cont
State state2 = context.getState(State::Energy); State state2 = context.getState(State::Energy);
context.setPositions(positions3); context.setPositions(positions3);
State state3 = context.getState(State::Energy); State state3 = context.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/stepSize, 1e-4); ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/stepSize, tol);
} }
...@@ -493,6 +496,8 @@ int main(int numberOfArguments, char* argv[]) { ...@@ -493,6 +496,8 @@ int main(int numberOfArguments, char* argv[]) {
try { try {
registerAmoebaCudaKernelFactories(); registerAmoebaCudaKernelFactories();
if (numberOfArguments > 1)
Platform::getPlatformByName("CUDA").setPropertyDefaultValue("Precision", std::string(argv[1]));
/* /*
* Water dimer energy / force tests under various conditions. * Water dimer energy / force tests under various conditions.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment